Advertisement
Guest User

Cuda xbmc

a guest
Jul 10th, 2012
1,848
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 456.39 KB | None | 0 0
  1. diff --git a/lib/DllAvUtil.h b/lib/DllAvUtil.h
  2. index e882cac..7afc9af 100644
  3. --- a/lib/DllAvUtil.h
  4. +++ b/lib/DllAvUtil.h
  5. @@ -96,6 +96,7 @@ public:
  6. virtual int av_fifo_size(AVFifoBuffer *f) = 0;
  7. virtual int av_fifo_generic_read(AVFifoBuffer *f, void *dest, int buf_size, void (*func)(void*, void*, int)) = 0;
  8. virtual int av_fifo_generic_write(AVFifoBuffer *f, void *src, int size, int (*func)(void*, void*, int)) = 0;
  9. + virtual int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max) = 0;
  10. virtual char *av_strdup(const char *s)=0;
  11. };
  12.  
  13. @@ -167,6 +168,7 @@ class DllAvUtilBase : public DllDynamic, DllAvUtilInterface
  14. DEFINE_METHOD1(int, av_fifo_size, (AVFifoBuffer *p1))
  15. DEFINE_METHOD4(int, av_fifo_generic_read, (AVFifoBuffer *p1, void *p2, int p3, void (*p4)(void*, void*, int)))
  16. DEFINE_METHOD4(int, av_fifo_generic_write, (AVFifoBuffer *p1, void *p2, int p3, int (*p4)(void*, void*, int)))
  17. + DEFINE_METHOD5(int, av_reduce, (int *p1, int *p2, int64_t p3, int64_t p4, int64_t p5))
  18. DEFINE_METHOD1(char*, av_strdup, (const char *p1))
  19.  
  20. public:
  21. @@ -188,6 +190,7 @@ class DllAvUtilBase : public DllDynamic, DllAvUtilInterface
  22. RESOLVE_METHOD(av_fifo_size)
  23. RESOLVE_METHOD(av_fifo_generic_read)
  24. RESOLVE_METHOD(av_fifo_generic_write)
  25. + RESOLVE_METHOD(av_reduce)
  26. RESOLVE_METHOD(av_strdup)
  27. END_METHOD_RESOLVE()
  28. };
  29. diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
  30. index 24f3ea6..e269339 100644
  31. --- a/project/VS2010Express/XBMC.vcxproj
  32. +++ b/project/VS2010Express/XBMC.vcxproj
  33. @@ -302,6 +302,13 @@
  34. <ClCompile Include="..\..\xbmc\AutoSwitch.cpp" />
  35. <ClCompile Include="..\..\xbmc\BackgroundInfoLoader.cpp" />
  36. <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CrystalHD.cpp" />
  37. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.cpp" />
  38. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.cpp" />
  39. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.cpp" />
  40. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.cpp" />
  41. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.cpp" />
  42. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.cpp" />
  43. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.cpp" />
  44. <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDInputStreams\DVDInputStreamBluray.cpp" />
  45. <ClCompile Include="..\..\xbmc\cores\VideoRenderers\RenderCapture.cpp" />
  46. <ClCompile Include="..\..\xbmc\cores\VideoRenderers\VideoShaders\WinVideoFilter.cpp" />
  47. @@ -1196,6 +1203,17 @@
  48. <ClInclude Include="..\..\xbmc\AutoSwitch.h" />
  49. <ClInclude Include="..\..\xbmc\BackgroundInfoLoader.h" />
  50. <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CrystalHD.h" />
  51. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.h" />
  52. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.h" />
  53. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.h" />
  54. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda.h" />
  55. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda_dynlink.h" />
  56. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuviddec.h" />
  57. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.h" />
  58. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.h" />
  59. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.h" />
  60. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\nvcuvid.h" />
  61. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.h" />
  62. <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDInputStreams\DVDInputStreamBluray.h" />
  63. <ClInclude Include="..\..\xbmc\cores\VideoRenderers\RenderCapture.h" />
  64. <ClInclude Include="..\..\xbmc\cores\VideoRenderers\VideoShaders\WinVideoFilter.h" />
  65. diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
  66. index 3a37750..770e509 100644
  67. --- a/project/VS2010Express/XBMC.vcxproj.filters
  68. +++ b/project/VS2010Express/XBMC.vcxproj.filters
  69. @@ -238,6 +238,9 @@
  70. <Filter Include="interfaces\info">
  71. <UniqueIdentifier>{cea579fc-bdd7-499e-a6a6-07d681d1ab24}</UniqueIdentifier>
  72. </Filter>
  73. + <Filter Include="cores\dvdplayer\DVDCodecs\Video\Cuda">
  74. + <UniqueIdentifier>{2affa4cc-9f39-42d9-97cc-4f595a6c2aa9}</UniqueIdentifier>
  75. + </Filter>
  76. </ItemGroup>
  77. <ItemGroup>
  78. <ClCompile Include="..\..\xbmc\win32\pch.cpp">
  79. @@ -2493,6 +2496,27 @@
  80. <ClCompile Include="..\..\xbmc\guilib\GUIAction.cpp">
  81. <Filter>guilib</Filter>
  82. </ClCompile>
  83. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.cpp">
  84. + <Filter>cores\dvdplayer\DVDCodecs\Video</Filter>
  85. + </ClCompile>
  86. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.cpp">
  87. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  88. + </ClCompile>
  89. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.cpp">
  90. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  91. + </ClCompile>
  92. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.cpp">
  93. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  94. + </ClCompile>
  95. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.cpp">
  96. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  97. + </ClCompile>
  98. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.cpp">
  99. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  100. + </ClCompile>
  101. + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.cpp">
  102. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  103. + </ClCompile>
  104. </ItemGroup>
  105. <ItemGroup>
  106. <ClInclude Include="..\..\xbmc\win32\pch.h">
  107. @@ -4976,8 +5000,8 @@
  108. <ClInclude Include="..\..\xbmc\threads\ThreadLocal.h">
  109. <Filter>threads</Filter>
  110. </ClInclude>
  111. - <ClInclude Include="..\..\xbmc\input\InertialScrollingHandler.h" >
  112. - <Filter>input</Filter>
  113. + <ClInclude Include="..\..\xbmc\input\InertialScrollingHandler.h">
  114. + <Filter>input</Filter>
  115. </ClInclude>
  116. <ClInclude Include="..\..\xbmc\threads\platform\Condition.h">
  117. <Filter>threads\platform</Filter>
  118. @@ -5006,6 +5030,39 @@
  119. <ClInclude Include="..\..\xbmc\guilib\GUIAction.h">
  120. <Filter>guilib</Filter>
  121. </ClInclude>
  122. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.h">
  123. + <Filter>cores\dvdplayer\DVDCodecs\Video</Filter>
  124. + </ClInclude>
  125. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\nvcuvid.h">
  126. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  127. + </ClInclude>
  128. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuviddec.h">
  129. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  130. + </ClInclude>
  131. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda_dynlink.h">
  132. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  133. + </ClInclude>
  134. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda.h">
  135. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  136. + </ClInclude>
  137. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.h">
  138. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  139. + </ClInclude>
  140. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.h">
  141. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  142. + </ClInclude>
  143. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.h">
  144. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  145. + </ClInclude>
  146. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.h">
  147. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  148. + </ClInclude>
  149. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.h">
  150. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  151. + </ClInclude>
  152. + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.h">
  153. + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
  154. + </ClInclude>
  155. </ItemGroup>
  156. <ItemGroup>
  157. <ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
  158. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
  159. index 03f6dcc..50ac74c 100644
  160. --- a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
  161. +++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
  162. @@ -37,6 +37,9 @@
  163. #if defined(HAVE_LIBCRYSTALHD)
  164. #include "Video/DVDVideoCodecCrystalHD.h"
  165. #endif
  166. +#if defined(HAS_DX)
  167. +#include "Video/CUDA.h"
  168. +#endif
  169. #include "Audio/DVDAudioCodecFFmpeg.h"
  170. #include "Audio/DVDAudioCodecLibMad.h"
  171. #include "Audio/DVDAudioCodecPcm.h"
  172. @@ -236,7 +239,10 @@ CDVDVideoCodec* CDVDFactoryCodec::CreateVideoCodec( CDVDStreamInfo &hint )
  173. }
  174. }
  175. #endif
  176. -
  177. + //Cuda
  178. +#if defined(HAS_DX)
  179. + if( (pCodec = OpenCodec(new CUDA::CDVDVideoCodecCuda(), hint, options)) ) return pCodec;
  180. +#endif
  181. // try to decide if we want to try halfres decoding
  182. #if !defined(_LINUX) && !defined(_WIN32)
  183. float pixelrate = (float)hint.width*hint.height*hint.fpsrate/hint.fpsscale;
  184. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
  185. index 25ebcd7..3d1e9c7 100644
  186. --- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
  187. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
  188. @@ -32,6 +32,7 @@
  189. #define FRAME_TYPE_B 3
  190. #define FRAME_TYPE_D 4
  191.  
  192. +namespace CUDA { class CCuda; }
  193. namespace DXVA { class CProcessor; }
  194. namespace VAAPI { struct CHolder; }
  195. class CVDPAU;
  196. @@ -55,6 +56,9 @@ struct DVDVideoPicture
  197. BYTE* data[4]; // [4] = alpha channel, currently not used
  198. int iLineSize[4]; // [4] = alpha channel, currently not used
  199. };
  200. + struct {
  201. + CUDA::CCuda* cuda;
  202. + };
  203. struct {
  204. DXVA::CProcessor* proc;
  205. int64_t proc_id;
  206. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp
  207. new file mode 100644
  208. index 0000000..cc9ec35
  209. --- /dev/null
  210. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp
  211. @@ -0,0 +1,1256 @@
  212. +/*
  213. + * Copyright (C) 2005-2009 Team XBMC
  214. + * http://www.xbmc.org
  215. + *
  216. + * This Program is free software; you can redistribute it and/or modify
  217. + * it under the terms of the GNU General Public License as published by
  218. + * the Free Software Foundation; either version 2, or (at your option)
  219. + * any later version.
  220. + *
  221. + * This Program is distributed in the hope that it will be useful,
  222. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  223. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  224. + * GNU General Public License for more details.
  225. + *
  226. + * You should have received a copy of the GNU General Public License
  227. + * along with XBMC; see the file COPYING. If not, write to
  228. + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  229. + * http://www.gnu.org/copyleft/gpl.html
  230. + *
  231. + */
  232. +
  233. +#ifdef HAS_DX
  234. +
  235. +#if (defined HAVE_CONFIG_H) && (!defined WIN32)
  236. + #include "config.h"
  237. +#elif defined(_WIN32)
  238. +#include "system.h"
  239. +#endif
  240. +
  241. +// setting that here because otherwise SampleFormat is defined to AVSampleFormat
  242. +// which we don't use here
  243. +#define FF_API_OLD_SAMPLE_FMT 0
  244. +
  245. +#define RINT(x) ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)))
  246. +
  247. +#include <windows.h>
  248. +#include "CUDA.h"
  249. +#include "../../../../windowing/WindowingFactory.h"
  250. +#include "DVDStreamInfo.h"
  251. +#include "Cuda/MPEG2HeaderParser.h"
  252. +#include "Cuda/H264SequenceParser.h"
  253. +#include "Cuda/VC1HeaderParser.h"
  254. +#include "utils/SystemInfo.h"
  255. +#include "DllAvCodec.h"
  256. +
  257. +using namespace CUDA;
  258. +
  259. +static struct {
  260. + CodecID ffcodec;
  261. + cudaVideoCodec cudaCodec;
  262. +} cuda_codecs[] = {
  263. + { CODEC_ID_MPEG1VIDEO, cudaVideoCodec_MPEG1 },
  264. + { CODEC_ID_MPEG2VIDEO, cudaVideoCodec_MPEG2 },
  265. + { CODEC_ID_VC1, cudaVideoCodec_VC1 },
  266. + { CODEC_ID_H264, cudaVideoCodec_H264 },
  267. + { CODEC_ID_MPEG4, cudaVideoCodec_MPEG4 },
  268. +};
  269. +
  270. +////////////////////////////////////////////////////////////////////////////////
  271. +// Compatibility tables
  272. +////////////////////////////////////////////////////////////////////////////////
  273. +
  274. +#define LEVEL_C_LOW_LIMIT 0x0A20
  275. +
  276. +static DWORD LevelCBlacklist[] = {
  277. + 0x0A22, 0x0A67, // Geforce 315, no VDPAU at all
  278. + 0x0A68, 0x0A69, // Geforce G105M, only B
  279. + 0x0CA0, 0x0CA7, // Geforce GT 330, only A
  280. + 0x0CAC, // Geforce GT 220, no VDPAU
  281. + 0x10C3 // Geforce 8400GS, only A
  282. +};
  283. +
  284. +static DWORD LevelCWhitelist[] = {
  285. + 0x06C0, // Geforce GTX 480
  286. + 0x06C4, // Geforce GTX 465
  287. + 0x06CA, // Geforce GTX 480M
  288. + 0x06CD, // Geforce GTX 470
  289. + 0x08A5, // Geforce 320M
  290. +
  291. + 0x06D8, 0x06DC, // Quadro 6000
  292. + 0x06D9, // Quadro 5000
  293. + 0x06DA, // Quadro 5000M
  294. + 0x06DD, // Quadro 4000
  295. +
  296. + 0x06D1, // Tesla C2050 / C2070
  297. + 0x06D2, // Tesla M2070
  298. + 0x06DE, // Tesla T20 Processor
  299. + 0x06DF, // Tesla M2070-Q
  300. +};
  301. +
  302. +static BOOL IsLevelC(DWORD deviceId)
  303. +{
  304. + int idx = 0;
  305. + if (deviceId >= LEVEL_C_LOW_LIMIT) {
  306. + for(idx = 0; idx < sizeof(LevelCBlacklist); idx++) {
  307. + if (LevelCBlacklist[idx] == deviceId)
  308. + return FALSE;
  309. + }
  310. + return TRUE;
  311. + } else {
  312. + for(idx = 0; idx < sizeof(LevelCWhitelist); idx++) {
  313. + if (LevelCWhitelist[idx] == deviceId)
  314. + return TRUE;
  315. + }
  316. + return FALSE;
  317. + }
  318. +}
  319. +
  320. +
  321. +CDVDVideoCodecCuda::CDVDVideoCodecCuda() : CDVDVideoCodec()
  322. +{
  323. + ZeroMemory(&cuda, sizeof(cuda));
  324. + ZeroMemory(&m_VideoFormat, sizeof(m_VideoFormat));
  325. + ZeroMemory(&m_DXVAExtendedFormat, sizeof(m_DXVAExtendedFormat));
  326. + m_AccelDeintOutput = 0;
  327. + m_DeintTreatAsProgressive = 0;
  328. + m_DeintAggressive = 0;
  329. + m_bVDPAULevelC = FALSE;
  330. + m_cudaContext = 0;
  331. + m_cudaCtxLock = 0;
  332. + m_hParser = 0;
  333. + m_hDecoder = 0;
  334. + m_hStream = 0;
  335. + m_bForceSequenceUpdate = FALSE;
  336. + m_bInterlaced =FALSE;
  337. + m_bFlushing =FALSE;
  338. + m_pbRawNV12 = NULL;
  339. + m_cRawNV12 = 0;
  340. + m_AVC1Converter = NULL;
  341. + m_dllAvUtil.Load();
  342. +}
  343. +
  344. +CDVDVideoCodecCuda::~CDVDVideoCodecCuda()
  345. +{
  346. + Dispose();
  347. +}
  348. +
  349. +bool CDVDVideoCodecCuda::DestroyDecoder(bool bFull)
  350. +{
  351. + if (m_AVC1Converter) {
  352. + SAFE_DELETE(m_AVC1Converter);
  353. + }
  354. +
  355. + if (m_hDecoder) {
  356. + cuda.cuvidDestroyDecoder(m_hDecoder);
  357. + m_hDecoder = 0;
  358. + }
  359. +
  360. + if (m_hParser) {
  361. + cuda.cuvidDestroyVideoParser(m_hParser);
  362. + m_hParser = 0;
  363. + }
  364. +
  365. + if (m_hStream) {
  366. + cuda.cuStreamDestroy(m_hStream);
  367. + m_hStream = 0;
  368. + }
  369. +
  370. + if (m_pbRawNV12) {
  371. + cuda.cuMemFreeHost(m_pbRawNV12);
  372. + m_pbRawNV12 = NULL;
  373. + m_cRawNV12 = 0;
  374. + }
  375. +
  376. + if(bFull) {
  377. + if (m_cudaCtxLock) {
  378. + cuda.cuvidCtxLockDestroy(m_cudaCtxLock);
  379. + m_cudaCtxLock = 0;
  380. + }
  381. +
  382. + if (m_cudaContext) {
  383. + cuda.cuCtxDestroy(m_cudaContext);
  384. + m_cudaContext = 0;
  385. + }
  386. +
  387. + FreeLibrary(cuda.cudaLib);
  388. + FreeLibrary(cuda.cuvidLib);
  389. + }
  390. +
  391. + return true;
  392. +}
  393. +
  394. +#define GET_PROC_EX(name, lib) \
  395. + cuda.name = (t##name *)GetProcAddress(lib, #name); \
  396. + if (cuda.name == NULL) { \
  397. + CLog::Log(LOGERROR,"Failed to load function \"%s\"", TEXT(#name)); \
  398. + return E_FAIL; \
  399. + }
  400. +
  401. +#define GET_PROC_CUDA(name) GET_PROC_EX(name, cuda.cudaLib)
  402. +#define GET_PROC_CUVID(name) GET_PROC_EX(name, cuda.cuvidLib)
  403. +
  404. +
  405. +bool CDVDVideoCodecCuda::LoadCUDAFuncRefs()
  406. +{
  407. + // Load CUDA functions
  408. + cuda.cudaLib = LoadLibrary("nvcuda.dll");
  409. + if (cuda.cudaLib == NULL)
  410. + {
  411. + CLog::Log(LOGERROR,"Loading nvcuda.dll failed");
  412. + return false;
  413. + }
  414. +
  415. + GET_PROC_CUDA(cuInit);
  416. + GET_PROC_CUDA(cuCtxCreate);
  417. + GET_PROC_CUDA(cuCtxDestroy);
  418. + GET_PROC_CUDA(cuCtxPushCurrent);
  419. + GET_PROC_CUDA(cuCtxPopCurrent);
  420. + GET_PROC_CUDA(cuD3D9CtxCreate);
  421. + GET_PROC_CUDA(cuMemAllocHost);
  422. + GET_PROC_CUDA(cuMemFreeHost);
  423. + GET_PROC_CUDA(cuMemcpyDtoH);
  424. + GET_PROC_CUDA(cuMemcpyDtoHAsync);
  425. + GET_PROC_CUDA(cuStreamCreate);
  426. + GET_PROC_CUDA(cuStreamDestroy);
  427. + GET_PROC_CUDA(cuStreamQuery);
  428. + GET_PROC_CUDA(cuDeviceGetCount);
  429. + GET_PROC_CUDA(cuDriverGetVersion);
  430. + GET_PROC_CUDA(cuDeviceGetName);
  431. + GET_PROC_CUDA(cuDeviceComputeCapability);
  432. + GET_PROC_CUDA(cuDeviceGetAttribute);
  433. +
  434. + // Load CUVID function
  435. + cuda.cuvidLib = LoadLibrary("nvcuvid.dll");
  436. + if (cuda.cuvidLib == NULL)
  437. + {
  438. + CLog::Log(LOGERROR,"Loading nvcuvid.dll failed");
  439. + return false;
  440. + }
  441. +
  442. + GET_PROC_CUVID(cuvidCtxLockCreate);
  443. + GET_PROC_CUVID(cuvidCtxLockDestroy);
  444. + GET_PROC_CUVID(cuvidCtxLock);
  445. + GET_PROC_CUVID(cuvidCtxUnlock);
  446. + GET_PROC_CUVID(cuvidCreateVideoParser);
  447. + GET_PROC_CUVID(cuvidParseVideoData);
  448. + GET_PROC_CUVID(cuvidDestroyVideoParser);
  449. + GET_PROC_CUVID(cuvidCreateDecoder);
  450. + GET_PROC_CUVID(cuvidDecodePicture);
  451. + GET_PROC_CUVID(cuvidDestroyDecoder);
  452. + GET_PROC_CUVID(cuvidMapVideoFrame);
  453. + GET_PROC_CUVID(cuvidUnmapVideoFrame);
  454. +
  455. + return true;
  456. +}
  457. +
  458. +// Beginning of GPU Architecture definitions
  459. +static int _ConvertSMVer2CoresDrvApi(int major, int minor)
  460. +{
  461. + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
  462. + typedef struct {
  463. + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
  464. + int Cores;
  465. + } sSMtoCores;
  466. +
  467. + sSMtoCores nGpuArchCoresPerSM[] =
  468. + {
  469. + { 0x10, 8 },
  470. + { 0x11, 8 },
  471. + { 0x12, 8 },
  472. + { 0x13, 8 },
  473. + { 0x20, 32 },
  474. + { 0x21, 48 },
  475. + { 0x30, 192 },
  476. + { -1, -1 }
  477. + };
  478. +
  479. + int index = 0;
  480. + while (nGpuArchCoresPerSM[index].SM != -1) {
  481. + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
  482. + return nGpuArchCoresPerSM[index].Cores;
  483. + }
  484. + index++;
  485. + }
  486. + printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
  487. + return -1;
  488. +}
  489. +
  490. +int CDVDVideoCodecCuda::GetMaxGflopsGraphicsDeviceId()
  491. +{
  492. + CUdevice current_device = 0, max_perf_device = 0;
  493. + int device_count = 0, sm_per_multiproc = 0;
  494. + int max_compute_perf = 0, best_SM_arch = 0;
  495. + int major = 0, minor = 0, multiProcessorCount, clockRate;
  496. + int bTCC = 0, version;
  497. + char deviceName[256];
  498. +
  499. + cuda.cuDeviceGetCount(&device_count);
  500. + if (device_count <= 0)
  501. + return -1;
  502. +
  503. + cuda.cuDriverGetVersion(&version);
  504. +
  505. + // Find the best major SM Architecture GPU device that are graphics devices
  506. + while ( current_device < device_count ) {
  507. + cuda.cuDeviceGetName(deviceName, 256, current_device);
  508. + cuda.cuDeviceComputeCapability(&major, &minor, current_device);
  509. +
  510. + if (version >= 3020) {
  511. + cuda.cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
  512. + } else {
  513. + // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
  514. + if (deviceName[0] == 'T') bTCC = 1;
  515. + }
  516. + if (!bTCC) {
  517. + if (major > 0 && major < 9999) {
  518. + best_SM_arch = std::max(best_SM_arch, major);
  519. + }
  520. + }
  521. + current_device++;
  522. + }
  523. +
  524. + // Find the best CUDA capable GPU device
  525. + current_device = 0;
  526. + while( current_device < device_count ) {
  527. + cuda.cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device);
  528. + cuda.cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device);
  529. + cuda.cuDeviceComputeCapability(&major, &minor, current_device);
  530. +
  531. + if (version >= 3020) {
  532. + cuda.cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
  533. + } else {
  534. + // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
  535. + if (deviceName[0] == 'T') bTCC = 1;
  536. + }
  537. +
  538. + if (major == 9999 && minor == 9999) {
  539. + sm_per_multiproc = 1;
  540. + } else {
  541. + sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
  542. + }
  543. +
  544. + // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor
  545. + if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
  546. + {
  547. + int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
  548. + if(compute_perf > max_compute_perf) {
  549. + // If we find GPU with SM major > 2, search only these
  550. + if (best_SM_arch > 2) {
  551. + // If our device = dest_SM_arch, then we pick this one
  552. + if (major == best_SM_arch) {
  553. + max_compute_perf = compute_perf;
  554. + max_perf_device = current_device;
  555. + }
  556. + } else {
  557. + max_compute_perf = compute_perf;
  558. + max_perf_device = current_device;
  559. + }
  560. + }
  561. +
  562. +#ifdef _DEBUG
  563. + cuda.cuDeviceGetName(deviceName, 256, current_device);
  564. + CLog::Log(LOGINFO,"CUDA Device: %S, Compute: %d.%d, CUDA Cores: %d, Clock: %d MHz", deviceName, major, minor, multiProcessorCount * sm_per_multiproc, clockRate / 1000);
  565. +#endif
  566. + }
  567. + ++current_device;
  568. + }
  569. + return max_perf_device;
  570. +}
  571. +
  572. +bool CDVDVideoCodecCuda::Init()
  573. +{
  574. + CLog::Log(LOGINFO,"%s: Trying to open CUVID device",__FUNCTION__);
  575. +
  576. + CUresult cuStatus = CUDA_SUCCESS;
  577. +
  578. + if (!LoadCUDAFuncRefs())
  579. + {
  580. + CLog::Log(LOGERROR,"Loading CUDA interfaces failed");
  581. + return false;
  582. + }
  583. +
  584. + cuStatus = cuda.cuInit(0);
  585. + if (cuStatus != CUDA_SUCCESS)
  586. + {
  587. + CLog::Log(LOGERROR,"cuInit failed (status: %d)", cuStatus);
  588. + return false;
  589. + }
  590. +
  591. + // TODO: select best device
  592. + int best_device = GetMaxGflopsGraphicsDeviceId();
  593. + int device = best_device;
  594. +
  595. + HWND hwnd = g_Windowing.GetHwnd();
  596. +
  597. + D3DADAPTER_IDENTIFIER9 d3dId;
  598. + D3DPRESENT_PARAMETERS d3dpp;
  599. + D3DDISPLAYMODE d3ddm;
  600. + HRESULT hr = g_Windowing.Get3DDevice()->GetDirect3D(&m_pD3D);
  601. +
  602. + unsigned uAdapterCount = m_pD3D->GetAdapterCount();
  603. + for (unsigned lAdapter=0; lAdapter<uAdapterCount; lAdapter++) {
  604. + CLog::Log(LOGINFO,"Trying D3D Adapter %d..", lAdapter);
  605. +
  606. + ZeroMemory(&d3dpp, sizeof(d3dpp));
  607. + m_pD3D->GetAdapterDisplayMode(lAdapter, &d3ddm);
  608. +
  609. + d3dpp.Windowed = TRUE;
  610. + d3dpp.BackBufferWidth = 640;
  611. + d3dpp.BackBufferHeight = 480;
  612. + d3dpp.BackBufferCount = 1;
  613. + d3dpp.BackBufferFormat = d3ddm.Format;
  614. + d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
  615. + d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
  616. +
  617. + IDirect3DDevice9 *pDev = g_Windowing.Get3DDevice();
  618. + CUcontext cudaCtx = 0;
  619. + //why the fpu preserve ??
  620. + //hr = m_pD3D->CreateDevice(lAdapter, D3DDEVTYPE_HAL, hwnd, D3DCREATE_HARDWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED | D3DCREATE_FPU_PRESERVE, &d3dpp, &pDev);
  621. +
  622. + if (1)
  623. + {
  624. + m_pD3D->GetAdapterIdentifier(lAdapter, 0, &d3dId);
  625. + cuStatus = cuda.cuD3D9CtxCreate(&cudaCtx, &device, CU_CTX_SCHED_BLOCKING_SYNC, pDev);
  626. + if (cuStatus == CUDA_SUCCESS) {
  627. + CLog::Log(LOGINFO, "-> Created D3D Device on adapter %S (%d), using CUDA device %d", d3dId.Description, lAdapter, device);
  628. +
  629. + BOOL isLevelC = IsLevelC(d3dId.DeviceId);
  630. + CLog::Log(LOGINFO,"InitCUDA(): D3D Device with Id 0x%x is level C: %d", d3dId.DeviceId, isLevelC);
  631. +
  632. + if (m_bVDPAULevelC && !isLevelC) {
  633. + CLog::Log(LOGINFO, "InitCUDA(): We already had a Level C+ device, this one is not, skipping");
  634. + continue;
  635. + }
  636. +
  637. + if (m_cudaContext)
  638. + cuda.cuCtxDestroy(m_cudaContext);
  639. +
  640. + // Store resources
  641. + m_cudaContext = cudaCtx;
  642. + m_bVDPAULevelC = isLevelC;
  643. + // Is this the one we want?
  644. + if (device == best_device)
  645. + break;
  646. + }
  647. + else
  648. + CLog::Log(LOGINFO,"D3D Device on adapter %d is not CUDA capable", lAdapter);
  649. + }
  650. + }
  651. +
  652. + cuStatus = CUDA_SUCCESS;
  653. +
  654. + /*if (!m_pD3DDevice) {
  655. + DbgLog((LOG_TRACE, 10, L"-> No D3D device available, building non-D3D context on device %d", best_device));
  656. + SafeRelease(&m_pD3D);
  657. + cuStatus = cuda.cuCtxCreate(&m_cudaContext, CU_CTX_SCHED_BLOCKING_SYNC, best_device);
  658. +
  659. + int major, minor;
  660. + cuda.cuDeviceComputeCapability(&major, &minor, best_device);
  661. + m_bVDPAULevelC = (major >= 2);
  662. + DbgLog((LOG_TRACE, 10, L"InitCUDA(): pure CUDA context of device with compute %d.%d", major, minor));
  663. + }*/
  664. +
  665. + if (cuStatus == CUDA_SUCCESS) {
  666. + // Switch to a floating context
  667. + CUcontext curr_ctx = NULL;
  668. + cuStatus = cuda.cuCtxPopCurrent(&curr_ctx);
  669. + if (cuStatus != CUDA_SUCCESS)
  670. + {
  671. + CLog::Log(LOGERROR, "Storing context on the stack failed with error %d", cuStatus);
  672. + return false;
  673. + }
  674. + cuStatus = cuda.cuvidCtxLockCreate(&m_cudaCtxLock, m_cudaContext);
  675. + if (cuStatus != CUDA_SUCCESS) {
  676. + CLog::Log(LOGERROR, "Creation of floating context failed with error %d", cuStatus);
  677. + return false;
  678. + }
  679. + }
  680. + else
  681. + {
  682. + CLog::Log(LOGERROR, "Creation of CUDA context failed with error %d", cuStatus);
  683. + return false;
  684. + }
  685. +
  686. + return true;
  687. +
  688. +}
  689. +
  690. +DWORD avc_quant(BYTE *src, BYTE *dst, int extralen)
  691. +{
  692. + DWORD cb = 0;
  693. + BYTE* src_end = (BYTE *) src + extralen;
  694. + BYTE* dst_end = (BYTE *) dst + extralen;
  695. + src += 5;
  696. + // Two runs, for sps and pps
  697. + for (int i = 0; i < 2; i++)
  698. + {
  699. + for (int n = *(src++) & 0x1f; n > 0; n--)
  700. + {
  701. + unsigned len = (((unsigned)src[0] << 8) | src[1]) + 2;
  702. + if(src + len > src_end || dst + len > dst_end) { ASSERT(0); break; }
  703. + memcpy(dst, src, len);
  704. + src += len;
  705. + dst += len;
  706. + cb += len;
  707. + }
  708. + }
  709. + return cb;
  710. +}
  711. +
  712. +
  713. +bool CDVDVideoCodecCuda::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
  714. +{
  715. + if (!Init())
  716. + return false;
  717. + if (hints.codec == CODEC_ID_MPEG4)
  718. + m_pFormatName.Format("cuda_mpeg4");
  719. + else if (hints.codec == CODEC_ID_MPEG2VIDEO)
  720. + m_pFormatName.Format("cuda_mpeg2");
  721. + else if (hints.codec == CODEC_ID_H264)
  722. + m_pFormatName.Format("cuda_h264");
  723. + else if (hints.codec == CODEC_ID_VC1 || hints.codec == CODEC_ID_WMV3)
  724. + m_pFormatName.Format("cuda_vc1");
  725. + CLog::Log(LOGINFO, "CDecCuvid::InitDecoder(): Initializing CUVID decoder");
  726. + HRESULT hr = S_OK;
  727. +
  728. + if (!m_cudaContext) {
  729. + CLog::Log(LOGERROR, " InitDecoder called without a cuda context");
  730. + return false;
  731. + }
  732. +
  733. +
  734. + // Free old device
  735. + DestroyDecoder(false);
  736. +
  737. + // Flush Display Queue
  738. + memset(&m_DisplayQueue, 0, sizeof(m_DisplayQueue));
  739. + for (int i=0; i<DISPLAY_DELAY; i++)
  740. + m_DisplayQueue[i].picture_index = -1;
  741. + m_DisplayPos = 0;
  742. +
  743. + cudaVideoCodec cudaCodec = (cudaVideoCodec)-1;
  744. + for (int i = 0; i < countof(cuda_codecs); i++) {
  745. + if (cuda_codecs[i].ffcodec == hints.codec) {
  746. + cudaCodec = cuda_codecs[i].cudaCodec;
  747. + break;
  748. + }
  749. + }
  750. +
  751. + if (cudaCodec == -1) {
  752. + CLog::Log(LOGERROR, "Codec id %d does not map to a CUVID codec", hints.codec);
  753. + return false;
  754. + }
  755. +
  756. + if (cudaCodec == cudaVideoCodec_MPEG4 && !m_bVDPAULevelC) {
  757. + CLog::Log(LOGERROR, "Device is not capable to decode this format (not >= Level C)");
  758. + return false;
  759. + }
  760. +
  761. + /*m_bUseTimestampQueue = (cudaCodec == cudaVideoCodec_H264 && m_pCallback->H264IsAVI())
  762. + || (cudaCodec == cudaVideoCodec_MPEG4 && pmt->formattype != FORMAT_MPEG2Video)
  763. + || (cudaCodec == cudaVideoCodec_VC1 && m_pCallback->VC1IsDTS());*/
  764. + m_bUseTimestampQueue = (CODEC_ID_MPEG4 == hints.codec);
  765. + m_bWaitForKeyframe = m_bUseTimestampQueue;
  766. + m_bInterlaced = TRUE;
  767. + m_bFormatIncompatible = FALSE;
  768. + m_bTFF = TRUE;
  769. + m_rtPrevDiff = AV_NOPTS_VALUE;
  770. + m_bARPresent = TRUE;
  771. +
  772. + // Create the CUDA Video Parser
  773. + CUVIDPARSERPARAMS oVideoParserParameters;
  774. + ZeroMemory(&oVideoParserParameters, sizeof(CUVIDPARSERPARAMS));
  775. + oVideoParserParameters.CodecType = cudaCodec;
  776. + oVideoParserParameters.ulMaxNumDecodeSurfaces = MAX_DECODE_FRAMES;
  777. + oVideoParserParameters.ulMaxDisplayDelay = DISPLAY_DELAY;
  778. + oVideoParserParameters.pUserData = this;
  779. + oVideoParserParameters.pfnSequenceCallback = CDVDVideoCodecCuda::HandleVideoSequence; // Called before decoding frames and/or whenever there is a format change
  780. + oVideoParserParameters.pfnDecodePicture = CDVDVideoCodecCuda::HandlePictureDecode; // Called when a picture is ready to be decoded (decode order)
  781. + oVideoParserParameters.pfnDisplayPicture = CDVDVideoCodecCuda::HandlePictureDisplay; // Called whenever a picture is ready to be displayed (display order)
  782. + oVideoParserParameters.ulErrorThreshold = m_bUseTimestampQueue ? 100 : 0;
  783. +
  784. + memset(&m_VideoParserExInfo, 0, sizeof(CUVIDEOFORMATEX));
  785. + //TODO
  786. + //pmt->formattype == FORMAT_MPEG2Video && (pmt->subtype == MEDIASUBTYPE_AVC1 || pmt->subtype == MEDIASUBTYPE_avc1 || pmt->subtype == MEDIASUBTYPE_CCV1)) {
  787. + if (hints.codec != CODEC_ID_MPEG4)
  788. + {
  789. + //MPEG2VIDEOINFO *mp2vi = (MPEG2VIDEOINFO *)pmt->Format();
  790. + m_AVC1Converter = new CAVC1AnnexBConverter();
  791. + m_AVC1Converter->SetNALUSize(2);
  792. + BYTE* extradata = (BYTE*)hints.extradata;
  793. +
  794. + int nalusize = (extradata[4] & 3) + 1;
  795. +
  796. + BYTE *annexBextra = NULL;
  797. + int size = 0;
  798. + BYTE* dwSequenceHeader;
  799. + dwSequenceHeader = (byte*)malloc(hints.extrasize);
  800. + int cbSequenceHeader;
  801. + cbSequenceHeader = avc_quant(extradata, (BYTE *)(dwSequenceHeader), hints.extrasize);
  802. + //m_AVC1Converter->Convert(&annexBextra, &size, (BYTE *)mp2vi->dwSequenceHeader, mp2vi->cbSequenceHeader);
  803. + m_AVC1Converter->Convert(&annexBextra, &size, (BYTE *)dwSequenceHeader, cbSequenceHeader);
  804. + if (annexBextra && size) {
  805. + memcpy(m_VideoParserExInfo.raw_seqhdr_data, annexBextra, size);
  806. + m_VideoParserExInfo.format.seqhdr_data_length = size;
  807. + m_dllAvUtil.av_freep(&annexBextra);
  808. + }
  809. +
  810. + //m_AVC1Converter->SetNALUSize(smp2vi->dwFlags);
  811. + m_AVC1Converter->SetNALUSize(4);
  812. + } else {
  813. + size_t hdr_len = 0;
  814. + memcpy( m_VideoParserExInfo.raw_seqhdr_data, hints.extradata,hints.extrasize);
  815. + //getExtraData(*pmt, m_VideoParserExInfo.raw_seqhdr_data, &hdr_len);
  816. + m_VideoParserExInfo.format.seqhdr_data_length = (unsigned int)hints.extrasize;
  817. + }
  818. +
  819. + m_bNeedSequenceCheck = FALSE;
  820. + if (m_VideoParserExInfo.format.seqhdr_data_length) {
  821. + if (cudaCodec == cudaVideoCodec_H264) {
  822. + hr = CheckH264Sequence(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
  823. + if (FAILED(hr)) {
  824. + return false;
  825. + } else if (hr == S_FALSE) {
  826. + m_bNeedSequenceCheck = TRUE;
  827. + }
  828. + } else if (cudaCodec == cudaVideoCodec_MPEG2) {
  829. + CLog::Log(LOGINFO, "-> Scanning extradata for MPEG2 sequence header");
  830. + CMPEG2HeaderParser mpeg2parser(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
  831. + if (mpeg2parser.hdr.valid) {
  832. + if (mpeg2parser.hdr.chroma >= 2) {
  833. + CLog::Log(LOGERROR, "Sequence header indicates incompatible chroma sampling (chroma: %d)", mpeg2parser.hdr.chroma);
  834. + return false;
  835. + }
  836. + m_bInterlaced = mpeg2parser.hdr.interlaced;
  837. + }
  838. + } else if (cudaCodec == cudaVideoCodec_VC1) {
  839. + CVC1HeaderParser vc1Parser(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
  840. + m_bInterlaced = vc1Parser.hdr.interlaced;
  841. + }
  842. + } else {
  843. + m_bNeedSequenceCheck = (cudaCodec == cudaVideoCodec_H264);
  844. + }
  845. +
  846. + oVideoParserParameters.pExtVideoInfo = &m_VideoParserExInfo;
  847. + CUresult oResult = cuda.cuvidCreateVideoParser(&m_hParser, &oVideoParserParameters);
  848. + if (oResult != CUDA_SUCCESS) {
  849. + CLog::Log(LOGERROR, "Creating parser for type %d failed with code %d", cudaCodec, oResult);
  850. + return E_FAIL;
  851. + }
  852. +
  853. + {
  854. + cuda.cuvidCtxLock(m_cudaCtxLock, 0);
  855. + oResult = cuda.cuStreamCreate(&m_hStream, 0);
  856. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  857. + if (oResult != CUDA_SUCCESS) {
  858. + CLog::Log(LOGERROR, "::InitCodec(): Creating stream failed");
  859. + return E_FAIL;
  860. + }
  861. + }
  862. +
  863. + //BITMAPINFOHEADER *bmi = NULL;
  864. + //videoFormatTypeHandler(pmt->Format(), pmt->FormatType(), &bmi);
  865. +
  866. + {
  867. + RECT rcDisplayArea = {0, 0, hints.width, hints.height};
  868. + hr = CreateCUVIDDecoder(cudaCodec, hints.width, hints.height, hints.width, hints.height, rcDisplayArea);
  869. + if (FAILED(hr)) {
  870. + CLog::Log(LOGERROR,"Creating CUVID decoder failed");
  871. + return false;
  872. + }
  873. + }
  874. +
  875. + m_bForceSequenceUpdate = TRUE;
  876. +
  877. + DecodeSequenceData();
  878. +
  879. +
  880. +
  881. + return true;
  882. +}
  883. + // Scans the given buffer for an H264 SPS and reports whether it describes video this decoder accepts.
  884. +bool CDVDVideoCodecCuda::CheckH264Sequence(const BYTE *buffer, int buflen)
  885. +{ // NOTE(review): Decode() stores this result in an HRESULT and tests it with FAILED()/S_OK; a bool cannot carry those states - confirm the intended return type
  886. + CLog::Log(LOGINFO, "CDecCuvid::CheckH264Sequence(): Checking H264 frame for SPS");
  887. + CH264SequenceParser h264parser;
  888. + h264parser.ParseNALs(buffer, buflen, 0);
  889. + if (h264parser.sps.valid) {
  890. + m_bInterlaced = h264parser.sps.interlaced; // cache the SPS properties on the codec instance
  891. + m_iFullRange = h264parser.sps.full_range;
  892. + m_bARPresent = h264parser.sps.ar_present;
  893. + CLog::Log(LOGINFO, "SPS found");
  894. + if (h264parser.sps.profile > 100 || h264parser.sps.chroma != 1 || h264parser.sps.luma_bitdepth != 8 || h264parser.sps.chroma_bitdepth != 8) { // reject profile > 100, chroma value other than 1, or non-8-bit luma/chroma
  895. + CLog::Log(LOGERROR, "SPS indicates video incompatible with CUVID, aborting (profile: %d, chroma: %d, bitdepth: %d/%d)", h264parser.sps.profile, h264parser.sps.chroma, h264parser.sps.luma_bitdepth, h264parser.sps.chroma_bitdepth);
  896. + return false;
  897. + }
  898. + CLog::Log(LOGINFO, "Video seems compatible with CUVID");
  899. + return true;
  900. + }
  901. + return false; // no valid SPS in this buffer; indistinguishable from the "incompatible" result above
  902. +}
  903. + // Translates a range flag and libav color primaries / matrix / transfer values into the fields of a DXVA2_ExtendedFormat.
  904. +void fillDXVAExtFormat(DXVA2_ExtendedFormat &fmt, int range, int primaries, int matrix, int transfer)
  905. +{
  906. + fmt.value = 0; // start from a cleared value; inputs with no matching case below leave their field untouched
  907. +
  908. + if (range != -1) // -1 means "range unknown": NominalRange is not written
  909. + fmt.NominalRange = range ? DXVA2_NominalRange_0_255 : DXVA2_NominalRange_16_235;
  910. +
  911. + // Color Primaries
  912. + switch(primaries) {
  913. + case AVCOL_PRI_BT709:
  914. + fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT709;
  915. + break;
  916. + case AVCOL_PRI_BT470M:
  917. + fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT470_2_SysM;
  918. + break;
  919. + case AVCOL_PRI_BT470BG:
  920. + fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT470_2_SysBG;
  921. + break;
  922. + case AVCOL_PRI_SMPTE170M:
  923. + fmt.VideoPrimaries = DXVA2_VideoPrimaries_SMPTE170M;
  924. + break;
  925. + case AVCOL_PRI_SMPTE240M:
  926. + fmt.VideoPrimaries = DXVA2_VideoPrimaries_SMPTE240M;
  927. + break;
  928. + }
  929. +
  930. + // Color Space / Transfer Matrix
  931. + switch (matrix) {
  932. + case AVCOL_SPC_BT709:
  933. + fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_BT709;
  934. + break;
  935. + case AVCOL_SPC_FCC:
  936. + fmt.VideoTransferMatrix = (DXVA2_VideoTransferMatrix)6; // raw value 6 cast to the enum; no named enumerator is used for FCC here
  937. + break;
  938. + case AVCOL_SPC_BT470BG:
  939. + case AVCOL_SPC_SMPTE170M:
  940. + fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_BT601; // both inputs map to the same BT601 matrix
  941. + break;
  942. + case AVCOL_SPC_SMPTE240M:
  943. + fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_SMPTE240M;
  944. + break;
  945. + case 8://AVCOL_SPC_YCGCO
  946. + fmt.VideoTransferMatrix = (DXVA2_VideoTransferMatrix)7; // raw value 7 cast to the enum for YCgCo
  947. + break;
  948. + }
  949. +
  950. + // Color Transfer Function
  951. + switch(transfer) {
  952. + case AVCOL_TRC_BT709:
  953. + fmt.VideoTransferFunction = DXVA2_VideoTransFunc_709;
  954. + break;
  955. + case AVCOL_TRC_GAMMA22:
  956. + fmt.VideoTransferFunction = DXVA2_VideoTransFunc_22;
  957. + break;
  958. + case AVCOL_TRC_GAMMA28:
  959. + fmt.VideoTransferFunction = DXVA2_VideoTransFunc_28;
  960. + break;
  961. + case AVCOL_SPC_SMPTE240M: // NOTE(review): this is a matrix (AVCOL_SPC_*) constant inside the transfer-function switch; presumably a transfer-characteristic constant was intended - confirm the numeric values coincide
  962. + fmt.VideoTransferFunction = DXVA2_VideoTransFunc_240M;
  963. + break;
  964. + }
  965. +}
  966. + // Returns the display-queue slot that follows m_DisplayPos (wrapping at DISPLAY_DELAY) without advancing m_DisplayPos.
  967. +CUVIDPARSERDISPINFO* CDVDVideoCodecCuda::GetNextFrame()
  968. +{
  969. + int next = (m_DisplayPos + 1) % DISPLAY_DELAY;
  970. + return &m_DisplayQueue[next]; // the slot may be empty (picture_index == -1); callers must check
  971. +}
  972. + // Parser sequence callback (registered as pfnSequenceCallback): re-creates the decoder when the format changed and refreshes interlacing, frame-time and color-format state. obj is the codec instance passed as pUserData.
  973. +int CUDAAPI CDVDVideoCodecCuda::HandleVideoSequence(void *obj, CUVIDEOFORMAT *cuvidfmt)
  974. +{
  975. + CLog::Log(LOGINFO, "%s: New Video Sequence",__FUNCTION__);
  976. + CDVDVideoCodecCuda *filter = static_cast<CDVDVideoCodecCuda *>(obj);
  977. +
  978. + CUVIDDECODECREATEINFO *dci = &filter->m_VideoDecoderInfo;
  979. +
  980. + if ((cuvidfmt->codec != dci->CodecType) // re-create when any creation parameter differs from the current decoder, or when an update is forced
  981. + || (cuvidfmt->coded_width != dci->ulWidth)
  982. + || (cuvidfmt->coded_height != dci->ulHeight)
  983. + || (cuvidfmt->display_area.right != dci->ulTargetWidth)
  984. + || (cuvidfmt->display_area.bottom != dci->ulTargetHeight)
  985. + || (cuvidfmt->chroma_format != dci->ChromaFormat)
  986. + || filter->m_bForceSequenceUpdate)
  987. + {
  988. + filter->m_bForceSequenceUpdate = FALSE;
  989. + RECT rcDisplayArea = {cuvidfmt->display_area.left, cuvidfmt->display_area.top, cuvidfmt->display_area.right, cuvidfmt->display_area.bottom};
  990. + filter->CreateCUVIDDecoder(cuvidfmt->codec, cuvidfmt->coded_width, cuvidfmt->coded_height, cuvidfmt->display_area.right, cuvidfmt->display_area.bottom, rcDisplayArea); // NOTE(review): the result is ignored, so a failed re-creation is not reported to the parser
  991. + }
  992. +
  993. + filter->m_bInterlaced = !cuvidfmt->progressive_sequence;
  994. + filter->m_bDoubleRateDeint = FALSE;
  995. + if (filter->m_bInterlaced && cuvidfmt->frame_rate.numerator && cuvidfmt->frame_rate.denominator) { // an average frame time is only derived for interlaced streams with a valid frame rate
  996. + double dFrameTime = 10000000.0 / ((double)cuvidfmt->frame_rate.numerator / cuvidfmt->frame_rate.denominator); // presumably the frame duration in 100ns REFERENCE_TIME units - TODO confirm
  997. + if (filter->m_AccelDeintOutput == 0/*DeintOutput_FramePerField*/ && filter->m_VideoDecoderInfo.DeinterlaceMode != cudaVideoDeinterlaceMode_Weave && !filter->m_DeintTreatAsProgressive && (int)(dFrameTime / 10000.0) != 41) { // frame times that truncate to 41 are excluded from double-rate output (presumably ~24 fps content - confirm)
  998. + filter->m_bDoubleRateDeint = TRUE;
  999. + dFrameTime /= 2.0; // one output picture per field halves the per-picture time
  1000. + }
  1001. + if (cuvidfmt->codec != cudaVideoCodec_MPEG4)
  1002. + filter->m_rtAvgTimePerFrame = REFERENCE_TIME(dFrameTime + 0.5); // round to nearest
  1003. + else
  1004. + filter->m_rtAvgTimePerFrame = AV_NOPTS_VALUE; //TODO: base on media type
  1005. + } else {
  1006. + filter->m_rtAvgTimePerFrame = AV_NOPTS_VALUE;
  1007. + }
  1008. + filter->m_VideoFormat = *cuvidfmt; // keep a copy of the format for Display()/Deliver()
  1009. +
  1010. + if (cuvidfmt->chroma_format != cudaVideoChromaFormat_420) {
  1011. + CLog::Log(LOGERROR, "CDecCuvid::HandleVideoSequence(): Incompatible Chroma Format detected");
  1012. + filter->m_bFormatIncompatible = TRUE; // Decode() turns this flag into VC_ERROR
  1013. + }
  1014. +
  1015. + fillDXVAExtFormat(filter->m_DXVAExtendedFormat, filter->m_iFullRange, cuvidfmt->video_signal_description.color_primaries, cuvidfmt->video_signal_description.matrix_coefficients, cuvidfmt->video_signal_description.transfer_characteristics);
  1016. +
  1017. + return TRUE; // always reports success, even when the format was flagged incompatible above
  1018. +}
  1019. + // Parser decode callback (registered as pfnDecodePicture): makes sure the target surface is not still queued for display, then submits the picture to the decoder.
  1020. +int CUDAAPI CDVDVideoCodecCuda::HandlePictureDecode(void *obj, CUVIDPICPARAMS *cuvidpic)
  1021. +{
  1022. + CDVDVideoCodecCuda *filter = reinterpret_cast<CDVDVideoCodecCuda *>(obj);
  1023. +
  1024. + if (filter->m_bFlushing) // nothing is decoded while a flush is in progress
  1025. + return FALSE;
  1026. +
  1027. + if (filter->m_bWaitForKeyframe) {
  1028. + if (cuvidpic->intra_pic_flag)
  1029. + filter->m_bWaitForKeyframe = FALSE; // first intra picture seen: normal decoding starts
  1030. + else {
  1031. + // Pop timestamp from the queue, drop frame
  1032. + if (!filter->m_timestampQueue.empty()) {
  1033. + filter->m_timestampQueue.pop();
  1034. + }
  1035. + return FALSE;
  1036. + }
  1037. + }
  1038. +
  1039. + int flush_pos = filter->m_DisplayPos; // invalidation starts at the current read position of the display queue
  1040. + for (;;) {
  1041. + bool frame_in_use = false;
  1042. + for (int i=0; i<DISPLAY_DELAY; i++) {
  1043. + if (filter->m_DisplayQueue[i].picture_index == cuvidpic->CurrPicIdx) {
  1044. + frame_in_use = true;
  1045. + break;
  1046. + }
  1047. + }
  1048. + if (!frame_in_use) {
  1049. + // No problem: we're safe to use this frame
  1050. + break;
  1051. + }
  1052. + // The target frame is still pending in the display queue:
  1053. + // Flush the oldest entry from the display queue and repeat
  1054. + if (filter->m_DisplayQueue[flush_pos].picture_index >= 0) {
  1055. + //TODO
  1056. + //filter->Display(&filter->m_DisplayQueue[flush_pos]);
  1057. + filter->m_DisplayQueue[flush_pos].picture_index = -1; // with Display() commented out, the pending picture is discarded rather than shown
  1058. + }
  1059. + flush_pos = (flush_pos + 1) % DISPLAY_DELAY;
  1060. + }
  1061. +
  1062. + filter->cuda.cuvidCtxLock(filter->m_cudaCtxLock, 0);
  1063. + filter->m_PicParams[cuvidpic->CurrPicIdx] = *cuvidpic; // remembered per surface so Deliver() can derive the frame type; NOTE(review): CurrPicIdx is not checked against MAX_PIC_INDEX
  1064. + __try {
  1065. + CUresult cuStatus = filter->cuda.cuvidDecodePicture(filter->m_hDecoder, cuvidpic);
  1066. + #ifdef _DEBUG
  1067. + if (cuStatus != CUDA_SUCCESS) {
  1068. + CLog::Log(LOGERROR, "CDVDVideoCodecCuda::HandlePictureDecode(): cuvidDecodePicture returned error code %d", cuStatus);
  1069. + }
  1070. + #endif
  1071. + } __except(1) {
  1072. + CLog::Log(LOGERROR, "CDVDVideoCodecCuda::HandlePictureDecode(): cuvidDecodePicture threw an exception");
  1073. + }
  1074. + filter->cuda.cuvidCtxUnlock(filter->m_cudaCtxLock, 0);
  1075. +
  1076. + return TRUE; // the decode status is only inspected in _DEBUG builds and never affects this result
  1077. +}
  1078. + // Parser display callback (registered as pfnDisplayPicture): assigns a timestamp to the picture and stores it in the display-queue slot at m_DisplayPos.
  1079. +int CUDAAPI CDVDVideoCodecCuda::HandlePictureDisplay(void *obj, CUVIDPARSERDISPINFO *cuviddisp)
  1080. +{
  1081. + CDVDVideoCodecCuda *filter = reinterpret_cast<CDVDVideoCodecCuda *>(obj);
  1082. +
  1083. + if (filter->m_bFlushing)
  1084. + return FALSE;
  1085. +
  1086. + if (filter->m_bUseTimestampQueue) { // replace the parser timestamp with the oldest one pushed by Decode()
  1087. + if (filter->m_timestampQueue.empty()) {
  1088. + cuviddisp->timestamp = AV_NOPTS_VALUE;
  1089. + } else {
  1090. + cuviddisp->timestamp = filter->m_timestampQueue.front();
  1091. + filter->m_timestampQueue.pop();
  1092. + }
  1093. + }
  1094. +
  1095. + // Drop samples with negative timestamps (preroll) or during flushing
  1096. + if (cuviddisp->timestamp != AV_NOPTS_VALUE && cuviddisp->timestamp < 0)
  1097. + return TRUE;
  1098. +
  1099. + /*if (filter->m_DisplayQueue[filter->m_DisplayPos].picture_index >= 0) {
  1100. + filter->Display(&filter->m_DisplayQueue[filter->m_DisplayPos]);
  1101. + filter->m_DisplayQueue[filter->m_DisplayPos].picture_index = -1;
  1102. + }
  1103. + */filter->m_DisplayQueue[filter->m_DisplayPos] = *cuviddisp; // overwrites the current slot; m_DisplayPos is advanced in GetPicture(), not here
  1104. + /*filter->m_DisplayPos = (filter->m_DisplayPos + 1) % DISPLAY_DELAY;*/
  1105. +
  1106. + return TRUE;
  1107. +}
  1108. + // (Re)creates the CUVID decoder for the given codec and dimensions, destroying any existing one first; returns true when a decoder was created.
  1109. +bool CDVDVideoCodecCuda::CreateCUVIDDecoder(cudaVideoCodec codec, DWORD dwWidth, DWORD dwHeight, DWORD dwDisplayWidth, DWORD dwDisplayHeight, RECT rcDisplayArea)
  1110. +{
  1111. + //DbgLog((LOG_TRACE, 10, L"CDecCuvid::CreateCUVIDDecoder(): Creating CUVID decoder instance"));
  1112. + HRESULT hr = S_OK;
  1113. + BOOL bDXVAMode = (g_Windowing.Get3DDevice() && /*m_pSettings->GetHWAccelDeintHQ() &&*/ g_sysinfo.IsVistaOrHigher()); // DXVA mode is tried first when a D3D device exists on Vista or newer
  1114. +
  1115. + cuda.cuvidCtxLock(m_cudaCtxLock, 0);
  1116. + CUVIDDECODECREATEINFO *dci = &m_VideoDecoderInfo;
  1117. +
  1118. +retry:
  1119. + if (m_hDecoder) { // drop the previous decoder (also the case when retrying after a failed create left a handle)
  1120. + cuda.cuvidDestroyDecoder(m_hDecoder);
  1121. + m_hDecoder = 0;
  1122. + }
  1123. + ZeroMemory(dci, sizeof(*dci));
  1124. + dci->ulWidth = dwWidth;
  1125. + dci->ulHeight = dwHeight;
  1126. + dci->ulNumDecodeSurfaces = MAX_DECODE_FRAMES;
  1127. + dci->CodecType = codec;
  1128. + dci->ChromaFormat = cudaVideoChromaFormat_420;
  1129. + dci->OutputFormat = cudaVideoSurfaceFormat_NV12;
  1130. + //TODO
  1131. + dci->DeinterlaceMode = (cudaVideoDeinterlaceMode)0;//m_pSettings->GetHWAccelDeintMode();
  1132. + dci->ulNumOutputSurfaces = 1;
  1133. +
  1134. + dci->ulTargetWidth = dwDisplayWidth;
  1135. + dci->ulTargetHeight = dwDisplayHeight;
  1136. +
  1137. + dci->display_area.left = (short)rcDisplayArea.left; // RECT members are narrowed to short here
  1138. + dci->display_area.right = (short)rcDisplayArea.right;
  1139. + dci->display_area.top = (short)rcDisplayArea.top;
  1140. + dci->display_area.bottom = (short)rcDisplayArea.bottom;
  1141. +
  1142. + dci->ulCreationFlags = bDXVAMode ? cudaVideoCreate_PreferDXVA : cudaVideoCreate_PreferCUVID;
  1143. + dci->vidLock = m_cudaCtxLock;
  1144. +
  1145. + // create the decoder
  1146. + CUresult oResult = cuda.cuvidCreateDecoder(&m_hDecoder, dci);
  1147. + if (oResult != CUDA_SUCCESS) {
  1148. + //DbgLog((LOG_ERROR, 10, L"-> Creation of decoder for type %d failed with code %d", dci->CodecType, oResult));
  1149. + if (bDXVAMode) {
  1150. + //DbgLog((LOG_ERROR, 10, L" -> Retrying in pure CUVID mode"));
  1151. + bDXVAMode = FALSE; // one retry at most: the second pass runs with bDXVAMode cleared
  1152. + goto retry;
  1153. + }
  1154. + hr = E_FAIL;
  1155. + }
  1156. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  1157. +
  1158. + return SUCCEEDED(hr);
  1159. +}
  1160. +bool CDVDVideoCodecCuda::DecodeSequenceData() // Feeds the stored sequence header (if any) to the video parser; returns false only when the parser rejects it.
  1161. +{
  1162. + CUresult oResult = CUDA_SUCCESS; // stays CUDA_SUCCESS when there is no sequence header to parse
  1163. +
  1164. + CUVIDSOURCEDATAPACKET pCuvidPacket;
  1165. + ZeroMemory(&pCuvidPacket, sizeof(pCuvidPacket));
  1166. +
  1167. + pCuvidPacket.payload = m_VideoParserExInfo.raw_seqhdr_data;
  1168. + pCuvidPacket.payload_size = m_VideoParserExInfo.format.seqhdr_data_length;
  1169. +
  1170. + if (pCuvidPacket.payload && pCuvidPacket.payload_size) {
  1171. + cuda.cuvidCtxLock(m_cudaCtxLock, 0);
  1172. + oResult = cuda.cuvidParseVideoData(m_hParser, &pCuvidPacket);
  1173. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  1174. + }
  1175. +
  1176. + return oResult == CUDA_SUCCESS; // previously the parser result was discarded and true was returned unconditionally
  1177. +}
  1178. + // Applies the deinterlacing / field-order settings to the display info and passes the picture to Deliver(), once or once per field; returns true only if every Deliver() call succeeded.
  1179. +bool CDVDVideoCodecCuda::Display(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture)
  1180. +{
  1181. + BOOL bTreatAsProgressive = m_DeintTreatAsProgressive;
  1182. +
  1183. + if (bTreatAsProgressive) {
  1184. + cuviddisp->progressive_frame = TRUE;
  1185. + m_nSoftTelecine = FALSE;
  1186. + } else {
  1187. + if (m_VideoFormat.codec == cudaVideoCodec_MPEG2 || m_VideoFormat.codec == cudaVideoCodec_H264) {
  1188. + if (cuviddisp->repeat_first_field) {
  1189. + m_nSoftTelecine = 2; // restart the countdown whenever a repeated field is seen
  1190. + } else if (m_nSoftTelecine) {
  1191. + m_nSoftTelecine--;
  1192. + }
  1193. + if (!m_nSoftTelecine)
  1194. + m_bTFF = cuviddisp->top_field_first; // field order is only re-latched outside a soft-telecine run
  1195. + }
  1196. +
  1197. + cuviddisp->progressive_frame = (cuviddisp->progressive_frame && !(m_bInterlaced && m_DeintAggressive && m_VideoFormat.codec != cudaVideoCodec_VC1) && !m_DeintForce);
  1198. + }
  1199. +
  1200. + DeintFieldOrder fo = m_DeIntFieldOrder;
  1201. + cuviddisp->top_field_first = (fo == DeintFieldOrder_Auto) ? (m_nSoftTelecine ? m_bTFF : cuviddisp->top_field_first) : (fo == DeintFieldOrder_TopFieldFirst);
  1202. + bool bDelivered = false; // result of the Deliver() call(s); the former "return S_OK" evaluated to false in this bool function
  1203. + if (m_bDoubleRateDeint) {
  1204. + if (cuviddisp->progressive_frame || m_nSoftTelecine) {
  1205. + bDelivered = Deliver(cuviddisp, pDvdVideoPicture, 2);
  1206. + } else {
  1207. + bDelivered = Deliver(cuviddisp, pDvdVideoPicture, 0);
  1208. + bDelivered = Deliver(cuviddisp, pDvdVideoPicture, 1) && bDelivered; // the second field is always delivered; both calls fill the same pDvdVideoPicture
  1209. + }
  1210. + } else {
  1211. + bDelivered = Deliver(cuviddisp, pDvdVideoPicture);
  1212. + }
  1213. + return bDelivered;
  1214. +}
  1215. + // Maps the decoded surface, copies it into the host staging buffer and fills pDvdVideoPicture (format, size, timing, flags, plane pointers). field selects the field passed to the post-processor (1 = second field). Returns false when any CUDA call fails.
  1216. +bool CDVDVideoCodecCuda::Deliver(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture, int field)
  1217. +{
  1218. + CUdeviceptr devPtr = 0;
  1219. + unsigned int pitch = 0, width = 0, height = 0;
  1220. + CUVIDPROCPARAMS vpp;
  1221. + CUresult cuStatus = CUDA_SUCCESS;
  1222. +
  1223. + memset(&vpp, 0, sizeof(vpp));
  1224. + vpp.progressive_frame = !m_nSoftTelecine && cuviddisp->progressive_frame;
  1225. + vpp.top_field_first = cuviddisp->top_field_first;
  1226. + vpp.second_field = (field == 1);
  1227. +
  1228. + cuda.cuvidCtxLock(m_cudaCtxLock, 0); // held until the frame is unmapped again (success path or cuda_fail)
  1229. + cuStatus = cuda.cuvidMapVideoFrame(m_hDecoder, cuviddisp->picture_index, &devPtr, &pitch, &vpp);
  1230. + if (cuStatus != CUDA_SUCCESS) {
  1231. + CLog::Log(LOGERROR, "CDecCuvid::Deliver(): cuvidMapVideoFrame failed on index %d", cuviddisp->picture_index);
  1232. + goto cuda_fail;
  1233. + }
  1234. +
  1235. + width = m_VideoDecoderInfo.display_area.right;
  1236. + height = m_VideoDecoderInfo.display_area.bottom;
  1237. + int size = pitch * height * 3 / 2; // luma plane (pitch * height) plus a half-sized second plane, matching the NV12 layout used below
  1238. +
  1239. + if(!m_pbRawNV12 || size > m_cRawNV12) { // (re)allocate the host staging buffer when missing or too small
  1240. + if (m_pbRawNV12) {
  1241. + cuda.cuMemFreeHost(m_pbRawNV12);
  1242. + m_pbRawNV12 = NULL;
  1243. + m_cRawNV12 = 0;
  1244. + }
  1245. + cuStatus = cuda.cuMemAllocHost((void **)&m_pbRawNV12, size);
  1246. + if (cuStatus != CUDA_SUCCESS) {
  1247. + CLog::Log(LOGERROR, "CDecCuvid::Deliver(): cuMemAllocHost failed to allocate %d bytes (%d)", size, cuStatus);
  1248. + goto cuda_fail;
  1249. + }
  1250. + m_cRawNV12 = size;
  1251. + }
  1252. + // Copy memory from the device into the staging area
  1253. + if (m_pbRawNV12) {
  1254. +#if USE_ASYNC_COPY
  1255. + cuStatus = cuda.cuMemcpyDtoHAsync(m_pbRawNV12, devPtr, size, m_hStream);
  1256. + if (cuStatus != CUDA_SUCCESS) {
  1257. + CLog::Log(LOGERROR, "Async Memory Transfer failed (%d)", cuStatus);
  1258. + goto cuda_fail;
  1259. + }
  1260. + while (CUDA_ERROR_NOT_READY == cuda.cuStreamQuery(m_hStream)) { // poll until the stream is idle; any other status (including errors) ends the loop
  1261. + Sleep(1);
  1262. + }
  1263. +#else
  1264. + cuStatus = cuda.cuMemcpyDtoH(m_pbRawNV12, devPtr, size);
  1265. + if (cuStatus != CUDA_SUCCESS) {
  1266. + CLog::Log(LOGERROR, "Memory Transfer failed (%d)", cuStatus);
  1267. + goto cuda_fail;
  1268. + }
  1269. +#endif
  1270. + } else {
  1271. + // If we don't have our memory, this is bad.
  1272. + CLog::Log(LOGERROR, "No Valid Staging Memory - failing");
  1273. + goto cuda_fail;
  1274. + }
  1275. + cuda.cuvidUnmapVideoFrame(m_hDecoder, devPtr);
  1276. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  1277. +
  1278. +
  1279. + // Setup the LAVFrame
  1280. + //DVDVideoPicture* pFrame = NULL;
  1281. + //LAVFrame *pFrame = NULL;
  1282. + //AllocateFrame(&pFrame);
  1283. +
  1284. +
  1285. + if (m_rtAvgTimePerFrame != AV_NOPTS_VALUE) {
  1286. + pDvdVideoPicture->iDuration = m_rtAvgTimePerFrame;//supposed to be avg frame duration
  1287. + }
  1288. +
  1289. + REFERENCE_TIME rtStart = cuviddisp->timestamp, rtStop = AV_NOPTS_VALUE;
  1290. + if (rtStart != AV_NOPTS_VALUE) {
  1291. + CUVIDPARSERDISPINFO *next = GetNextFrame();
  1292. + if (next->picture_index != -1 && next->timestamp != AV_NOPTS_VALUE) {
  1293. + m_rtPrevDiff = next->timestamp - cuviddisp->timestamp; // distance to the following queued picture; the last known value is reused when none is queued
  1294. + }
  1295. +
  1296. + if (m_rtPrevDiff != AV_NOPTS_VALUE) {
  1297. + REFERENCE_TIME rtHalfDiff = m_rtPrevDiff >> 1;
  1298. + if (field == 1)
  1299. + rtStart += rtHalfDiff; // the second field starts half a frame later
  1300. +
  1301. + rtStop = rtStart + rtHalfDiff;
  1302. +
  1303. + if (field == 2 || !m_bDoubleRateDeint)
  1304. + rtStop += rtHalfDiff; // whole-frame output spans the full difference
  1305. + }
  1306. +
  1307. + // Sanity check in case the duration is null
  1308. + if (rtStop <= rtStart)
  1309. + rtStop = AV_NOPTS_VALUE;
  1310. + }
  1311. +
  1312. + pDvdVideoPicture->format = DVDVideoPicture::FMT_NV12;
  1313. + pDvdVideoPicture->iWidth = width;
  1314. + pDvdVideoPicture->iHeight = height;
  1315. + pDvdVideoPicture->pts = rtStart;
  1316. + if (rtStop>0) // NOTE(review): relies on AV_NOPTS_VALUE not being positive to skip the "no stop time" case - confirm
  1317. + pDvdVideoPicture->iDuration = rtStop - rtStart;
  1318. + pDvdVideoPicture->iRepeatPicture = cuviddisp->repeat_first_field;
  1319. + {
  1320. + AVRational ar = { m_VideoFormat.display_aspect_ratio.x, m_VideoFormat.display_aspect_ratio.y };
  1321. + AVRational arDim = { width, height };
  1322. + double aspect_ratio;
  1323. + if (m_bARPresent || av_cmp_q(ar, arDim) != 0) {
  1324. + if (ar.num == 0)
  1325. + aspect_ratio = 0;
  1326. + else
  1327. + aspect_ratio = av_q2d(ar) * pDvdVideoPicture->iWidth / pDvdVideoPicture->iHeight;
  1328. +
  1329. + if (aspect_ratio <= 0.0)
  1330. + aspect_ratio = (float)pDvdVideoPicture->iWidth / (float)pDvdVideoPicture->iHeight;
  1331. + pDvdVideoPicture->iDisplayHeight = pDvdVideoPicture->iHeight;
  1332. + pDvdVideoPicture->iDisplayWidth = ((int)RINT(pDvdVideoPicture->iHeight * aspect_ratio)) & -3; // NOTE(review): "& -3" clears only bit 1; "& ~3" may have been intended - confirm
  1333. + if (pDvdVideoPicture->iDisplayWidth > pDvdVideoPicture->iWidth)
  1334. + {
  1335. + pDvdVideoPicture->iDisplayWidth = pDvdVideoPicture->iWidth;
  1336. + pDvdVideoPicture->iDisplayHeight = ((int)RINT(pDvdVideoPicture->iWidth / aspect_ratio)) & -3;
  1337. + }
  1338. + //pFrame->aspect_ratio = ar;
  1339. + }
  1340. + }
  1341. + pDvdVideoPicture->iDisplayHeight = pDvdVideoPicture->iHeight; // NOTE(review): this and the next line overwrite the aspect-ratio based display size computed just above
  1342. + pDvdVideoPicture->iDisplayWidth = pDvdVideoPicture->iWidth;
  1343. + //pFrame->cuda //ext_format = m_DXVAExtendedFormat;
  1344. + bool interlaced = !cuviddisp->progressive_frame && m_VideoDecoderInfo.DeinterlaceMode == cudaVideoDeinterlaceMode_Weave;
  1345. + pDvdVideoPicture->iFlags |= interlaced ? DVP_FLAG_INTERLACED : 0; // flags are OR-ed in; nothing in this function clears iFlags first
  1346. + pDvdVideoPicture->iFlags |= cuviddisp->top_field_first ? DVP_FLAG_TOP_FIELD_FIRST: 0;
  1347. +
  1348. + // TODO: This may be wrong for H264 where B-Frames can be references
  1349. +
  1350. + pDvdVideoPicture->iFrameType = m_PicParams[cuviddisp->picture_index].intra_pic_flag ? 'I' : (m_PicParams[cuviddisp->picture_index].ref_pic_flag ? 'P' : 'B');
  1351. +
  1352. + // Assign the staging buffer to the picture's plane pointers
  1353. + int Ysize = height * pitch;
  1354. + pDvdVideoPicture->data[0] = m_pbRawNV12; // first plane at the start of the staging buffer
  1355. + pDvdVideoPicture->data[1] = m_pbRawNV12+Ysize; // second plane directly after height * pitch bytes
  1356. + pDvdVideoPicture->iLineSize[0] = pDvdVideoPicture->iLineSize[1] = pitch;
  1357. + //TODO
  1358. + //pFrame->stride[0] = pFrame->stride[1] = pitch;
  1359. + //TODO
  1360. + //m_pCallback->Deliver(pFrame);
  1361. +
  1362. + return true;
  1363. +
  1364. +cuda_fail:
  1365. + cuda.cuvidUnmapVideoFrame(m_hDecoder, devPtr); // NOTE(review): also reached when cuvidMapVideoFrame itself failed, in which case devPtr is still 0
  1366. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  1367. + return false;
  1368. +}
  1369. +
  1370. + // Currently a no-op.
  1371. +void CDVDVideoCodecCuda::Dispose()
  1372. +{
  1373. + // NOTE(review): nothing is released here (parser, decoder, stream, staging buffer, AVC1 converter) - confirm that cleanup happens elsewhere, e.g. in the destructor
  1374. +}
  1375. + // Currently a no-op: the requested drop state is ignored.
  1376. +void CDVDVideoCodecCuda::SetDropState(bool bDrop)
  1377. +{
  1378. + // bDrop is intentionally or provisionally unused - TODO confirm whether frame dropping should be supported
  1379. +}
  1380. + // Feeds one packet to the CUVID parser (after AVC1 conversion when a converter exists) and reports VC_PICTURE when the current display-queue slot holds a picture. The pts argument is not used.
  1381. +int CDVDVideoCodecCuda::Decode(BYTE* pData, int iSize, double dts, double pts)
  1382. +{
  1383. + CUresult result;
  1384. + HRESULT hr;
  1385. +
  1386. + CUVIDSOURCEDATAPACKET pCuvidPacket;
  1387. + ZeroMemory(&pCuvidPacket, sizeof(pCuvidPacket));
  1388. +
  1389. + BYTE *pBuffer = NULL;
  1390. + if (m_AVC1Converter) {
  1391. + int size = 0;
  1392. + hr = m_AVC1Converter->Convert(&pBuffer, &size, pData, iSize);
  1393. + if (SUCCEEDED(hr)) { // on failure the packet keeps its zeroed payload and is still handed to the parser below
  1394. + pCuvidPacket.payload = pBuffer;
  1395. + pCuvidPacket.payload_size = size;
  1396. + }
  1397. + } else {
  1398. + pCuvidPacket.payload = pData;
  1399. + pCuvidPacket.payload_size = iSize;
  1400. + }
  1401. +
  1402. + if (m_bNeedSequenceCheck && m_VideoDecoderInfo.CodecType == cudaVideoCodec_H264) {
  1403. + hr = CheckH264Sequence(pCuvidPacket.payload, pCuvidPacket.payload_size); // NOTE(review): CheckH264Sequence returns bool, so hr is 0 or 1 here and FAILED(hr) can never be true - confirm intent
  1404. + if (FAILED(hr)) {
  1405. + m_bFormatIncompatible = TRUE;
  1406. + } else if (hr == S_OK) {
  1407. + m_bNeedSequenceCheck = FALSE;
  1408. + }
  1409. + }
  1410. +
  1411. + if (dts != AV_NOPTS_VALUE) {
  1412. + pCuvidPacket.flags |= CUVID_PKT_TIMESTAMP;
  1413. + pCuvidPacket.timestamp = dts; // dts is a double; it is converted implicitly to the packet's timestamp type
  1414. + }
  1415. +
  1416. + //if (bDiscontinuity)
  1417. + // pCuvidPacket.flags |= CUVID_PKT_DISCONTINUITY;
  1418. +
  1419. + if (m_bUseTimestampQueue)
  1420. + m_timestampQueue.push(dts); // consumed by HandlePictureDisplay() / dropped by HandlePictureDecode()
  1421. +
  1422. + cuda.cuvidCtxLock(m_cudaCtxLock, 0);
  1423. + __try {
  1424. + result = cuda.cuvidParseVideoData(m_hParser, &pCuvidPacket); // result is never read afterwards
  1425. + } __except(1) {
  1426. + CLog::Log(LOGERROR, "CDecCuvid::Decode(): cuvidParseVideoData threw an exception");
  1427. + }
  1428. + cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
  1429. +
  1430. + m_dllAvUtil.av_freep(&pBuffer); // releases the converter output; pBuffer is still NULL when no conversion took place
  1431. +
  1432. + if (m_bFormatIncompatible) {
  1433. + CLog::Log(LOGERROR, "CDecCuvid::Decode(): Incompatible format detected, indicating failure...");
  1434. + return VC_ERROR;
  1435. + }
  1436. +
  1437. + if (m_DisplayQueue[m_DisplayPos].picture_index >= 0) // a picture is waiting in the slot GetPicture() will read
  1438. + return VC_BUFFER | VC_PICTURE;
  1439. + return VC_BUFFER;
  1440. +}
  1441. + // Currently a no-op.
  1442. +void CDVDVideoCodecCuda::Reset(void)
  1443. +{
  1444. + // NOTE(review): the display queue and timestamp queue are not cleared here - confirm whether a reset/seek should flush them
  1445. +}
  1446. + // Hands the picture waiting at m_DisplayPos to the caller and advances the display queue; returns false when no picture is queued.
  1447. +bool CDVDVideoCodecCuda::GetPicture(DVDVideoPicture* pDvdVideoPicture)
  1448. +{
  1449. + if (m_DisplayQueue[m_DisplayPos].picture_index >= 0) {
  1450. + Display(&m_DisplayQueue[m_DisplayPos], pDvdVideoPicture);
  1451. + m_DisplayQueue[m_DisplayPos].picture_index = -1; // mark the slot free again
  1452. + }
  1453. + else
  1454. + return false; // was "return VC_BUFFER", a non-zero decode flag that converted to true in this bool function
  1455. + //m_DisplayQueue[m_DisplayPos] = *cuviddisp;
  1456. + m_DisplayPos = (m_DisplayPos + 1) % DISPLAY_DELAY;
  1457. +
  1458. +
  1459. + return true; // was "VC_PICTURE | VC_BUFFER"; same truth value, now spelled as the bool it is
  1460. +}
  1461. +
  1462. +
  1463. +
  1464. +
  1465. +
  1466. +
  1467. +#endif
  1468. \ No newline at end of file
  1469. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h
  1470. new file mode 100644
  1471. index 0000000..245c916
  1472. --- /dev/null
  1473. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h
  1474. @@ -0,0 +1,184 @@
  1475. +/*
  1476. + * Copyright (C) 2012 Team XBMC
  1477. + * http://www.xbmc.org
  1478. + *
  1479. + * This Program is free software; you can redistribute it and/or modify
  1480. + * it under the terms of the GNU General Public License as published by
  1481. + * the Free Software Foundation; either version 2, or (at your option)
  1482. + * any later version.
  1483. + *
  1484. + * This Program is distributed in the hope that it will be useful,
  1485. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1486. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1487. + * GNU General Public License for more details.
  1488. + *
  1489. + * You should have received a copy of the GNU General Public License
  1490. + * along with XBMC; see the file COPYING. If not, write to
  1491. + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  1492. + * http://www.gnu.org/copyleft/gpl.html
  1493. + *
  1494. + */
  1495. +#ifdef HAS_DX
  1496. +#pragma once
  1497. +#include "DVDCodecs/Video/DVDVideoCodecFFmpeg.h"
  1498. +#include "Cuda/AVC1AnnexBConverter.h"
  1499. +#include <dxva2api.h>
  1500. +
  1501. +#define MAX_DECODE_FRAMES 20
  1502. +#define DISPLAY_DELAY 4
  1503. +#define USE_ASYNC_COPY 1
  1504. +#define MAX_PIC_INDEX 64
  1505. +
  1506. +#define CUDA_FORCE_API_VERSION 3010
  1507. +#include "cuda/cuda.h"
  1508. +#include "cuda/nvcuvid.h"
  1509. +#include "cuda/cuda_dynlink.h"
  1510. +#include <queue>
  1511. +#include "dllavutil.h"
  1512. +
  1513. +#define CUMETHOD(name) t##name *##name
  1514. +#define countof( array ) ( sizeof( array )/sizeof( array[0] ) )
  1515. +
  1516. +namespace CUDA {
  1517. +
  1518. + // Control the field order of the deinterlacer
  1519. +typedef enum DeintFieldOrder { // NOTE(review): this typedef declares no alias name, so it adds nothing beyond the enum declaration itself
  1520. + DeintFieldOrder_Auto, // Display() falls back to the stream's (or soft-telecine latched) field order
  1521. + DeintFieldOrder_TopFieldFirst, // Display() forces top_field_first to true
  1522. + DeintFieldOrder_BottomFieldFirst, // Display() forces top_field_first to false
  1523. +};
  1524. +
  1525. +class CDVDVideoCodecCuda : public CDVDVideoCodec // Video codec declaration built on CDVDVideoCodec; only GetName() is defined inline, everything else lives outside this header.
  1526. +{
  1527. +public:
  1528. + CDVDVideoCodecCuda();
  1529. + virtual ~CDVDVideoCodecCuda();
  1530. +
  1531. + // Required overrides
  1532. + virtual bool Open(CDVDStreamInfo &hints, CDVDCodecOptions &options);
  1533. + virtual void Dispose(void);
  1534. + virtual int Decode(BYTE *pData, int iSize, double dts, double pts);
  1535. + virtual void Reset(void);
  1536. + virtual bool GetPicture(DVDVideoPicture *pDvdVideoPicture);
  1537. + virtual void SetDropState(bool bDrop);
  1538. + virtual const char* GetName(void) { return (const char*)m_pFormatName; } // exposes m_pFormatName through its const char* conversion
  1539. +
  1540. +protected:
  1541. + bool DestroyDecoder(bool full);
  1542. + bool LoadCUDAFuncRefs();
  1543. + int GetMaxGflopsGraphicsDeviceId();
  1544. + bool Init();
  1545. +
  1546. + CStdString m_pFormatName; // string returned by GetName()
  1547. + DVDVideoPicture m_videobuffer;
  1548. +private:
  1549. + struct { // one pointer member per CUDA / CUVID entry point, each declared through CUMETHOD; NOTE(review): presumably filled in by LoadCUDAFuncRefs() - confirm
  1550. + HMODULE cudaLib; // module handle; NOTE(review): presumably the CUDA driver library - confirm
  1551. + CUMETHOD(cuInit);
  1552. + CUMETHOD(cuCtxCreate);
  1553. + CUMETHOD(cuCtxDestroy);
  1554. + CUMETHOD(cuCtxPushCurrent);
  1555. + CUMETHOD(cuCtxPopCurrent);
  1556. + CUMETHOD(cuD3D9CtxCreate);
  1557. + CUMETHOD(cuMemAllocHost);
  1558. + CUMETHOD(cuMemFreeHost);
  1559. + CUMETHOD(cuMemcpyDtoH);
  1560. + CUMETHOD(cuMemcpyDtoHAsync);
  1561. + CUMETHOD(cuStreamCreate);
  1562. + CUMETHOD(cuStreamDestroy);
  1563. + CUMETHOD(cuStreamQuery);
  1564. + CUMETHOD(cuDeviceGetCount);
  1565. + CUMETHOD(cuDriverGetVersion);
  1566. + CUMETHOD(cuDeviceGetName);
  1567. + CUMETHOD(cuDeviceComputeCapability);
  1568. + CUMETHOD(cuDeviceGetAttribute);
  1569. +
  1570. + HMODULE cuvidLib; // module handle; NOTE(review): presumably the CUVID video decoding library - confirm
  1571. + CUMETHOD(cuvidCtxLockCreate);
  1572. + CUMETHOD(cuvidCtxLockDestroy);
  1573. + CUMETHOD(cuvidCtxLock);
  1574. + CUMETHOD(cuvidCtxUnlock);
  1575. + CUMETHOD(cuvidCreateVideoParser);
  1576. + CUMETHOD(cuvidParseVideoData);
  1577. + CUMETHOD(cuvidDestroyVideoParser);
  1578. + CUMETHOD(cuvidCreateDecoder);
  1579. + CUMETHOD(cuvidDecodePicture);
  1580. + CUMETHOD(cuvidDestroyDecoder);
  1581. + CUMETHOD(cuvidMapVideoFrame);
  1582. + CUMETHOD(cuvidUnmapVideoFrame);
  1583. + } cuda;
  1584. +
  1585. + IDirect3D9 *m_pD3D;
  1586. +
  1587. + CUcontext m_cudaContext;
  1588. + CUvideoctxlock m_cudaCtxLock;
  1589. +
  1590. + CUvideoparser m_hParser;
  1591. + CUVIDEOFORMATEX m_VideoParserExInfo;
  1592. +
  1593. + CUvideodecoder m_hDecoder;
  1594. + CUVIDDECODECREATEINFO m_VideoDecoderInfo;
  1595. +
  1596. + CUVIDEOFORMAT m_VideoFormat;
  1597. +
  1598. + CUVIDPARSERDISPINFO m_DisplayQueue[DISPLAY_DELAY]; // fixed-size array; DISPLAY_DELAY is defined outside this excerpt
  1599. + int m_DisplayPos; // NOTE(review): presumably the current index into m_DisplayQueue - confirm
  1600. +
  1601. + CUVIDPICPARAMS m_PicParams[MAX_PIC_INDEX]; // fixed-size array; MAX_PIC_INDEX is defined outside this excerpt
  1602. +
  1603. + CUstream m_hStream;
  1604. +
  1605. + BOOL m_bVDPAULevelC;
  1606. +
  1607. + BOOL m_bForceSequenceUpdate;
  1608. + BOOL m_bInterlaced;
  1609. + BOOL m_bDoubleRateDeint;
  1610. + BOOL m_bFlushing;
  1611. + REFERENCE_TIME m_rtAvgTimePerFrame;
  1612. + REFERENCE_TIME m_rtPrevDiff;
  1613. + BOOL m_bWaitForKeyframe;
  1614. + int m_iFullRange;
  1615. +
  1616. + DXVA2_ExtendedFormat m_DXVAExtendedFormat;
  1617. +
  1618. + BYTE *m_pbRawNV12;
  1619. + int m_cRawNV12; // NOTE(review): presumably the byte size of m_pbRawNV12 - confirm
  1620. +
  1621. + CAVC1AnnexBConverter *m_AVC1Converter;
  1622. +
  1623. + BOOL m_bFormatIncompatible;
  1624. + BOOL m_bNeedSequenceCheck;
  1625. +
  1626. + BOOL m_bUseTimestampQueue;
  1627. + std::queue<REFERENCE_TIME> m_timestampQueue;
  1628. +
  1629. + int m_nSoftTelecine;
  1630. + BOOL m_bTFF;
  1631. + BOOL m_bARPresent;
  1632. +
  1633. + int m_AccelDeintOutput;
  1634. + BOOL m_DeintTreatAsProgressive;
  1635. + BOOL m_DeintAggressive;
  1636. + BOOL m_DeintForce;
  1637. + DeintFieldOrder m_DeIntFieldOrder;
  1638. +
  1639. + // CUDA Callbacks
  1640. + static int CUDAAPI HandleVideoSequence(void *obj, CUVIDEOFORMAT *cuvidfmt); // static, so the instance has to travel through `obj`; NOTE(review): presumably the `this` pointer - confirm
  1641. + static int CUDAAPI HandlePictureDecode(void *obj, CUVIDPICPARAMS *cuvidpic);
  1642. + static int CUDAAPI HandlePictureDisplay(void *obj, CUVIDPARSERDISPINFO *cuviddisp);
  1643. +
  1644. +
  1645. + bool Display(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture);
  1646. + bool Deliver(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture ,int field = 0); // `field` defaults to 0
  1647. +
  1648. + bool CreateCUVIDDecoder(cudaVideoCodec codec, DWORD dwWidth, DWORD dwHeight, DWORD dwDisplayWidth, DWORD dwDisplayHeight, RECT rcDisplayArea);
  1649. + bool DecodeSequenceData();
  1650. + bool CheckH264Sequence(const BYTE *buffer, int buflen);
  1651. + //STDMETHODIMP FlushParser();
  1652. + CUVIDPARSERDISPINFO* GetNextFrame();
  1653. + DllAvUtil m_dllAvUtil;
  1654. +};
  1655. +
  1656. +
  1657. +};
  1658. +#endif
  1659. \ No newline at end of file
  1660. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp
  1661. new file mode 100644
  1662. index 0000000..71de694
  1663. --- /dev/null
  1664. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp
  1665. @@ -0,0 +1,96 @@
  1666. +/*
  1667. + * Copyright (C) 2010-2012 Hendrik Leppkes
  1668. + * http://www.1f0.de
  1669. + *
  1670. + * This program is free software; you can redistribute it and/or modify
  1671. + * it under the terms of the GNU General Public License as published by
  1672. + * the Free Software Foundation; either version 2 of the License, or
  1673. + * (at your option) any later version.
  1674. + *
  1675. + * This program is distributed in the hope that it will be useful,
  1676. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1677. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1678. + * GNU General Public License for more details.
  1679. + *
  1680. + * You should have received a copy of the GNU General Public License along
  1681. + * with this program; if not, write to the Free Software Foundation, Inc.,
  1682. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  1683. + */
  1684. +
  1685. +
  1686. +#include "AVC1AnnexBConverter.h"
  1687. +
  1688. +
  1689. +
  1690. +#include "libavutil/intreadwrite.h"
  1691. +
  1692. +CAVC1AnnexBConverter::CAVC1AnnexBConverter(void) : m_NaluSize(0) // 0 = length-prefix size not configured yet; SetNALUSize() supplies the real value
  1693. +{ // m_NaluSize used to be left uninitialised, so Convert() read an indeterminate value whenever SetNALUSize() had not been called first
  1694. + m_dllAvUtil.Load(); // must run before any m_dllAvUtil.av_* call; NOTE(review): the result of Load() is not checked
  1695. +}
  1696. +
  1697. +CAVC1AnnexBConverter::~CAVC1AnnexBConverter(void) // Destructor: performs no explicit cleanup.
  1698. +{
  1699. + // Intentionally empty: the class keeps no buffer of its own; Convert() writes into pointers supplied by the caller.
  1700. +}
  1701. +
  1702. +HRESULT CAVC1AnnexBConverter::alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size, const uint8_t *in, uint32_t in_size) // Appends one NAL unit (start code + in_size payload bytes) to *poutbuf, growing the buffer with av_realloc.
  1703. +{ // Returns S_OK, or E_OUTOFMEMORY with *poutbuf and *poutbuf_size left unchanged when the buffer cannot be grown.
  1704. + const uint32_t offset = *poutbuf_size; // bytes already present in the output buffer
  1705. + const uint8_t nal_header_size = offset ? 3 : 4; // the first NAL gets 00 00 00 01, every later one 00 00 01
  1706. + const uint64_t new_size = (uint64_t)offset + in_size + nal_header_size; // 64-bit sum so the addition itself cannot wrap
  1707. + if (new_size > 0x7FFFFFFF) // the total is reported through an int; refuse anything that would overflow it
  1708. + return E_OUTOFMEMORY;
  1709. + void *tmp = m_dllAvUtil.av_realloc(*poutbuf, (int)new_size);
  1710. + if (!tmp)
  1711. + return E_OUTOFMEMORY;
  1712. + *poutbuf = (uint8_t *)tmp;
  1713. + *poutbuf_size = (int)new_size; // commit the new size only after the allocation has succeeded
  1714. + memcpy(*poutbuf+nal_header_size+offset, in, in_size);
  1715. + if (!offset) {
  1716. + AV_WB32(*poutbuf, 1);
  1717. + } else {
  1718. + (*poutbuf+offset)[0] = (*poutbuf+offset)[1] = 0;
  1719. + (*poutbuf+offset)[2] = 1;
  1720. + }
  1721. + return S_OK;
  1722. +}
  1723. +
  1724. +HRESULT CAVC1AnnexBConverter::Convert(BYTE **poutbuf, int *poutbuf_size, const BYTE *buf, int buf_size) // Rewrites the length-prefixed NAL units in buf as an Annex B (start code) stream in *poutbuf.
  1725. +{ // Returns S_OK on success; on any error frees *poutbuf, sets *poutbuf_size to 0 and returns E_FAIL.
  1726. + int32_t nal_size;
  1727. + const uint8_t *buf_end = buf + buf_size;
  1728. +
  1729. + *poutbuf_size = 0;
  1730. + // Only 1..4 byte length prefixes can be decoded below; anything else (including an unset size) is an error.
  1731. + if (m_NaluSize < 1 || m_NaluSize > 4)
  1732. + goto fail;
  1733. + do {
  1734. + if (buf_end - buf < m_NaluSize)
  1735. + goto fail;
  1736. + if (m_NaluSize == 1) {
  1737. + nal_size = buf[0];
  1738. + } else if (m_NaluSize == 2) {
  1739. + nal_size = AV_RB16(buf);
  1740. + } else if (m_NaluSize == 3) {
  1741. + nal_size = AV_RB24(buf); // exactly three bytes: AV_RB32 plus a shift touched one byte past the prefix and sign-extended sizes >= 0x800000
  1742. + } else {
  1743. + nal_size = AV_RB32(buf);
  1744. + }
  1745. + buf += m_NaluSize;
  1746. + // Compare against the bytes that are left instead of forming buf + nal_size, which can wrap for huge sizes.
  1747. + if (nal_size < 0 || nal_size > buf_end - buf)
  1748. + goto fail;
  1749. +
  1750. + if (FAILED(alloc_and_copy(poutbuf, poutbuf_size, buf, nal_size)))
  1751. + goto fail;
  1752. +
  1753. + buf += nal_size;
  1754. + buf_size -= (nal_size + m_NaluSize);
  1755. + } while (buf_size > 0);
  1756. + return S_OK;
  1757. +fail:
  1758. + m_dllAvUtil.av_freep(poutbuf);
  1759. + *poutbuf_size = 0; // never report a size for a buffer that has just been freed
  1760. + return E_FAIL;
  1761. +}
  1762. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h
  1763. new file mode 100644
  1764. index 0000000..dd45b17
  1765. --- /dev/null
  1766. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h
  1767. @@ -0,0 +1,34 @@
  1768. +/*
  1769. + * Copyright (C) 2010-2012 Hendrik Leppkes
  1770. + * http://www.1f0.de
  1771. + *
  1772. + * This program is free software; you can redistribute it and/or modify
  1773. + * it under the terms of the GNU General Public License as published by
  1774. + * the Free Software Foundation; either version 2 of the License, or
  1775. + * (at your option) any later version.
  1776. + *
  1777. + * This program is distributed in the hope that it will be useful,
  1778. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1779. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1780. + * GNU General Public License for more details.
  1781. + *
  1782. + * You should have received a copy of the GNU General Public License along
  1783. + * with this program; if not, write to the Free Software Foundation, Inc.,
  1784. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  1785. + */
  1786. +
  1787. +#pragma once
  1788. +#include "DllAvUtil.h"
  1789. +class CAVC1AnnexBConverter // Converts length-prefixed H.264 NAL units into start-code (Annex B) form.
  1790. +{
  1791. +public:
  1792. + CAVC1AnnexBConverter(void);
  1793. + ~CAVC1AnnexBConverter(void);
  1794. +
  1795. + HRESULT SetNALUSize(int nalusize) { m_NaluSize = nalusize; return S_OK; } // stores the value without validating it; always returns S_OK
  1796. + HRESULT Convert(BYTE **poutbuf, int *poutbuf_size, const BYTE *buf, int buf_size); // converts a whole input buffer; the result is returned through *poutbuf / *poutbuf_size
  1797. + HRESULT alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size, const uint8_t *in, uint32_t in_size); // appends a single NAL unit to *poutbuf; called by Convert()
  1798. +private:
  1799. + int m_NaluSize; // number of bytes in the length prefix that precedes each input NAL unit
  1800. + DllAvUtil m_dllAvUtil; // libavutil wrapper used for av_realloc / av_freep
  1801. +};
  1802. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp
  1803. new file mode 100644
  1804. index 0000000..bd8b2b8
  1805. --- /dev/null
  1806. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp
  1807. @@ -0,0 +1,101 @@
  1808. +/*
  1809. + * Copyright (C) 2010-2012 Hendrik Leppkes
  1810. + * http://www.1f0.de
  1811. + *
  1812. + * This program is free software; you can redistribute it and/or modify
  1813. + * it under the terms of the GNU General Public License as published by
  1814. + * the Free Software Foundation; either version 2 of the License, or
  1815. + * (at your option) any later version.
  1816. + *
  1817. + * This program is distributed in the hope that it will be useful,
  1818. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1819. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1820. + * GNU General Public License for more details.
  1821. + *
  1822. + * You should have received a copy of the GNU General Public License along
  1823. + * with this program; if not, write to the Free Software Foundation, Inc.,
  1824. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  1825. + *
  1826. + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
  1827. + */
  1828. +
  1829. +#include "ByteParser.h"
  1830. +
  1831. +#pragma warning( push )
  1832. +#pragma warning( disable : 4018 )
  1833. +#pragma warning( disable : 4244 )
  1834. +extern "C" {
  1835. +#define AVCODEC_X86_MATHOPS_H
  1836. +#define __STDC_CONSTANT_MACROS
  1837. +
  1838. +#include "libavcodec/get_bits.h"
  1839. +};
  1840. +#pragma warning( pop )
  1841. +
  1842. +CByteParser::CByteParser(const BYTE *pData, size_t length) // Sets up a bit reader over the `length` bytes starting at pData.
  1843. + : m_pData(pData), m_pEnd(pData + length)
  1844. +{
  1845. + m_dllAvUtil.Load(); // the av_* wrapper used on the next line needs the library loaded first
  1846. + m_gbCtx = static_cast<GetBitContext *>(m_dllAvUtil.av_mallocz(sizeof(*m_gbCtx)));
  1847. + init_get_bits(m_gbCtx, m_pData, static_cast<int>(length * 8)); // the size argument is given in bits
  1848. +}
  1849. +
  1850. +CByteParser::~CByteParser() // Releases the bit-reader context that the constructor allocated.
  1851. +{
  1852. + m_dllAvUtil.av_freep(&m_gbCtx);
  1853. +}
  1854. +
  1855. +unsigned int CByteParser::BitRead(unsigned int numBits, bool peek) // Reads numBits (1..32) bits; with peek set the read position is not advanced.
  1856. +{ // Returns 0 for an out-of-range numBits or when fewer than numBits bits remain, instead of reading past the end of the buffer.
  1857. + if (numBits == 0 || numBits > 32 || RemainingBits() < numBits)
  1858. + return 0;
  1859. +
  1860. + if (peek)
  1861. + return show_bits_long(m_gbCtx, numBits);
  1862. + else
  1863. + return get_bits_long(m_gbCtx, numBits);
  1864. +}
  1865. +
  1866. +size_t CByteParser::RemainingBits() const // Number of unread bits; 0 once the read position is at or beyond the end of the buffer.
  1867. +{ // get_bits_left() is a signed count that turns negative after an over-read; clamp it so the size_t result cannot wrap to a huge value.
  1868. + return get_bits_left(m_gbCtx) > 0 ? (size_t)get_bits_left(m_gbCtx) : 0;
  1869. +}
  1870. +
  1871. +size_t CByteParser::Pos() const // Byte offset of the read position: total length minus the whole bytes still unread.
  1872. +{
  1873. + return static_cast<size_t>(m_pEnd - m_pData) - Remaining();
  1874. +}
  1875. +
  1876. +// Exponential Golomb Coding (with k = 0)
  1877. +// As used in H.264/MPEG-4 AVC
  1878. +// http://en.wikipedia.org/wiki/Exponential-Golomb_coding
  1879. +
  1880. +unsigned CByteParser::UExpGolombRead() // Decodes one unsigned Exp-Golomb value; returns 0 when the code is truncated or longer than 32 bits can hold.
  1881. +{
  1882. + int n = -1; // ends up as the number of leading zero bits, i.e. the width of the suffix after the first 1 bit
  1883. + for(BYTE b = 0; !b && RemainingBits(); n++) {
  1884. + b = get_bits1(m_gbCtx);
  1885. + }
  1886. + if (n < 0 || n > 31 || RemainingBits() < (size_t)n) // a shift by 32 or more is undefined, and the suffix has to fit into what is left
  1887. + return 0;
  1888. + return ((1u << n) | BitRead(n)) - 1;
  1889. +}
  1890. +
  1891. +int CByteParser::SExpGolombRead()
  1892. +{
  1893. + // Signed Exp-Golomb values reuse the unsigned code and interleave the signs:
  1894. + //   unsigned: 0, 1,  2, 3,  4, 5,  6, ...
  1895. + //   signed:   0, 1, -1, 2, -2, 3, -3, ...
  1896. + // so an even unsigned code yields a value <= 0 and an odd one a positive value.
  1897. + // The code is bumped by one before testing, which flips that parity: odd now means "negate".
  1898. + const int bumped = UExpGolombRead() + 1;
  1899. + const int magnitude = bumped >> 1;
  1900. + if ((bumped & 1) == 0)
  1901. + return magnitude;
  1902. + return -magnitude;
  1903. +}
  1904. +
  1905. +void CByteParser::BitByteAlign() // Forwards to libavcodec's align_get_bits() on the wrapped bit reader.
  1906. +{
  1907. + align_get_bits(m_gbCtx);
  1908. +}
  1909. \ No newline at end of file
  1910. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h
  1911. new file mode 100644
  1912. index 0000000..f1c927a
  1913. --- /dev/null
  1914. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h
  1915. @@ -0,0 +1,69 @@
  1916. +/*
  1917. + * Copyright (C) 2010-2012 Hendrik Leppkes
  1918. + * http://www.1f0.de
  1919. + *
  1920. + * This program is free software; you can redistribute it and/or modify
  1921. + * it under the terms of the GNU General Public License as published by
  1922. + * the Free Software Foundation; either version 2 of the License, or
  1923. + * (at your option) any later version.
  1924. + *
  1925. + * This program is distributed in the hope that it will be useful,
  1926. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1927. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1928. + * GNU General Public License for more details.
  1929. + *
  1930. + * You should have received a copy of the GNU General Public License along
  1931. + * with this program; if not, write to the Free Software Foundation, Inc.,
  1932. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  1933. + *
  1934. + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
  1935. + */
  1936. +
  1937. +#pragma once
  1938. +
  1939. +#include "DllAvUtil.h"
  1940. +
  1941. +struct GetBitContext;
  1942. +
  1943. +/**
  1944. +* Byte Parser Utility Class
  1945. +*/
  1946. +class CByteParser
  1947. +{
  1948. +public:
  1949. + /** Construct a Byte Parser to parse the given BYTE array with the given length */
  1950. + CByteParser(const BYTE *pData, size_t length);
  1951. + virtual ~CByteParser();
  1952. +
  1953. + /** Read 1 to 32 Bits from the Byte Array. If peek is set, the data will just be returned, and the buffer not advanced. */
  1954. + unsigned int BitRead(unsigned int numBits, bool peek = false);
  1955. +
  1956. + /** Read a unsigned number in Exponential Golomb encoding (with k = 0) */
  1957. + unsigned int UExpGolombRead();
  1958. + /** Read a signed number in Exponential Golomb encoding (with k = 0) */
  1959. + int SExpGolombRead();
  1960. +
  1961. + /** Pointer to the start of the byte array */
  1962. + const BYTE *Start() const { return m_pData; }
  1963. + /** Pointer to the end of the byte array */
  1964. + const BYTE *End() const { return m_pEnd; }
  1965. +
  1966. + /** Overall length (in bytes) of the byte array; defined inline because ByteParser.cpp provides no out-of-line definition */
  1967. + size_t Length() const { return (size_t)(m_pEnd - m_pData); }
  1968. + /** Byte offset of the current read position from the start of the array */
  1969. + size_t Pos() const;
  1970. +
  1971. + /** Number of bytes remaining in the array */
  1972. + size_t Remaining() const { return RemainingBits() >> 3; }
  1973. +
  1974. + /** Number of bits remaining */
  1975. + size_t RemainingBits() const;
  1976. + /** Align the read position through align_get_bits() */
  1977. + void BitByteAlign();
  1978. +
  1979. +private:
  1980. + GetBitContext *m_gbCtx; // bit-reader state, allocated in the constructor and freed in the destructor
  1981. + DllAvUtil m_dllAvUtil;
  1982. + const BYTE *m_pData;
  1983. + const BYTE *m_pEnd;
  1984. +};
  1985. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp
  1986. new file mode 100644
  1987. index 0000000..7524d92
  1988. --- /dev/null
  1989. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp
  1990. @@ -0,0 +1,106 @@
  1991. +/*
  1992. + * Copyright (C) 2010-2012 Hendrik Leppkes
  1993. + * http://www.1f0.de
  1994. + *
  1995. + * This program is free software; you can redistribute it and/or modify
  1996. + * it under the terms of the GNU General Public License as published by
  1997. + * the Free Software Foundation; either version 2 of the License, or
  1998. + * (at your option) any later version.
  1999. + *
  2000. + * This program is distributed in the hope that it will be useful,
  2001. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2002. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2003. + * GNU General Public License for more details.
  2004. + *
  2005. + * You should have received a copy of the GNU General Public License along
  2006. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2007. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2008. + *
  2009. + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
  2010. + */
  2011. +
  2012. +
  2013. +#include "H264Nalu.h"
  2014. +
  2015. +void CH264Nalu::SetBuffer(const BYTE* pBuffer, size_t nSize, int nNALSize) // Attaches the reader to a new buffer and rewinds every position.
  2016. +{
  2017. + // Remember the input and the length-prefix size (0 means the buffer is scanned for start codes).
  2018. + m_pBuffer = pBuffer;
  2019. + m_nSize = nSize;
  2020. + m_nNALSize = nNALSize;
  2021. +
  2022. + // Start over from the beginning of the buffer.
  2023. + m_nCurPos = m_nNextRTP = 0;
  2024. + m_nNALStartPos = m_nNALDataPos = 0;
  2025. +
  2026. + if (m_nNALSize == 0 && m_nSize != 0)
  2027. + MoveToNextAnnexBStartcode(); // pre-position the cursor on the first start code
  2028. +}
  2029. +
  2030. +bool CH264Nalu::MoveToNextAnnexBStartcode() // Moves m_nCurPos to the next 00 00 01 start code; returns false with m_nCurPos == m_nSize when none is found.
  2031. +{
  2032. + // A buffer shorter than four bytes cannot be scanned. Fall through so it is marked as exhausted
  2033. + // rather than leaving m_nCurPos behind, where ReadNext() would go on to read past the end of the buffer.
  2034. + const size_t nBuffEnd = (m_nSize < 4) ? 0 : m_nSize - 4;
  2035. +
  2036. + for (size_t i=m_nCurPos; i<nBuffEnd; i++) {
  2037. + if (m_pBuffer[i] == 0x00 && m_pBuffer[i+1] == 0x00 && m_pBuffer[i+2] == 0x01) { // byte-wise test: no unaligned or endian-dependent DWORD load
  2038. + // Find next AnnexB Nal
  2039. + m_nCurPos = i;
  2040. + return true;
  2041. + }
  2042. + }
  2043. +
  2044. + m_nCurPos = m_nSize;
  2045. + return false;
  2046. +}
  2047. +
  2048. +bool CH264Nalu::MoveToNextRTPStartcode()
  2049. +{
  2050. + // The next length prefix is usable only if it starts inside the buffer;
  2051. + // otherwise the cursor is parked at the end of the buffer.
  2052. + const bool hasNext = (m_nNextRTP < m_nSize);
  2053. +
  2054. + m_nCurPos = hasNext ? m_nNextRTP : m_nSize;
  2055. +
  2056. + return hasNext;
  2057. +}
  2058. +
  2059. +bool CH264Nalu::ReadNext() // Steps to the next NAL unit, recording its start/data positions and header fields; returns false when no further unit can be read.
  2060. +{
  2061. +
  2062. + if (m_nCurPos >= m_nSize) return false; // cursor already at or past the end of the buffer
  2063. +
  2064. + if ((m_nNALSize != 0) && (m_nCurPos == m_nNextRTP)) // length-prefixed mode with the cursor sitting exactly on a length prefix
  2065. + {
  2066. + if (m_nCurPos+m_nNALSize >= m_nSize) return false; // no data byte would follow the length prefix
  2067. + // RTP Nalu type : (XX XX) XX XX NAL..., with XX XX XX XX or XX XX equal to NAL size
  2068. + m_nNALStartPos = m_nCurPos;
  2069. + m_nNALDataPos = m_nCurPos + m_nNALSize;
  2070. + unsigned nTemp = 0;
  2071. + for (int i=0; i<m_nNALSize; i++) // assemble the prefix, most significant byte first
  2072. + {
  2073. + nTemp = (nTemp << 8) + m_pBuffer[m_nCurPos++];
  2074. + }
  2075. + m_nNextRTP += nTemp + m_nNALSize; // position of the following length prefix
  2076. + MoveToNextRTPStartcode(); // leaves m_nCurPos on the next prefix, or at m_nSize if that lies outside the buffer
  2077. + }
  2078. + else
  2079. + {
  2080. + // Remove trailing bits
  2081. + while (m_pBuffer[m_nCurPos]==0x00 && ((*((DWORD*)(m_pBuffer+m_nCurPos)) & 0x00FFFFFF) != 0x00010000)) // NOTE(review): this loop is not bounded by m_nSize - confirm it cannot run past the buffer
  2082. + m_nCurPos++;
  2083. +
  2084. + // AnnexB Nalu : 00 00 01 NAL...
  2085. + m_nNALStartPos = m_nCurPos;
  2086. + m_nCurPos += 3; // step over the three-byte start code
  2087. + m_nNALDataPos = m_nCurPos;
  2088. + MoveToNextAnnexBStartcode(); // m_nCurPos now marks the end of this unit
  2089. + }
  2090. +
  2091. + forbidden_bit = (m_pBuffer[m_nNALDataPos]>>7) & 1; // bit 7 of the first data byte
  2092. + nal_reference_idc = (m_pBuffer[m_nNALDataPos]>>5) & 3; // bits 6..5
  2093. + nal_unit_type = (NALU_TYPE) (m_pBuffer[m_nNALDataPos] & 0x1f); // bits 4..0
  2094. +
  2095. + return true;
  2096. +}
  2097. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h
  2098. new file mode 100644
  2099. index 0000000..772c852
  2100. --- /dev/null
  2101. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h
  2102. @@ -0,0 +1,80 @@
  2103. +/*
  2104. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2105. + * http://www.1f0.de
  2106. + *
  2107. + * This program is free software; you can redistribute it and/or modify
  2108. + * it under the terms of the GNU General Public License as published by
  2109. + * the Free Software Foundation; either version 2 of the License, or
  2110. + * (at your option) any later version.
  2111. + *
  2112. + * This program is distributed in the hope that it will be useful,
  2113. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2114. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2115. + * GNU General Public License for more details.
  2116. + *
  2117. + * You should have received a copy of the GNU General Public License along
  2118. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2119. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2120. + *
  2121. + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
  2122. + */
  2123. +
  2124. +#pragma once
  2125. +
  2126. +
  2127. +typedef enum // NAL unit type codes; CH264Nalu::ReadNext() stores the low five bits of the first NAL data byte as one of these
  2128. +{ // NOTE(review): the values appear to follow the H.264 nal_unit_type numbering - confirm against the specification
  2129. + NALU_TYPE_SLICE = 1,
  2130. + NALU_TYPE_DPA = 2,
  2131. + NALU_TYPE_DPB = 3,
  2132. + NALU_TYPE_DPC = 4,
  2133. + NALU_TYPE_IDR = 5,
  2134. + NALU_TYPE_SEI = 6,
  2135. + NALU_TYPE_SPS = 7, // the only type CH264SequenceParser::ParseNALs() looks for
  2136. + NALU_TYPE_PPS = 8,
  2137. + NALU_TYPE_AUD = 9,
  2138. + NALU_TYPE_EOSEQ = 10,
  2139. + NALU_TYPE_EOSTREAM = 11,
  2140. + NALU_TYPE_FILL = 12
  2141. +} NALU_TYPE;
  2142. +
  2143. +
  2144. +class CH264Nalu // Iterates over the NAL units of a buffer, delimited either by start codes or by length prefixes.
  2145. +{
  2146. +private :
  2147. + int forbidden_bit; //! should be always FALSE
  2148. + int nal_reference_idc; //! NALU_PRIORITY_xxxx
  2149. + NALU_TYPE nal_unit_type; //! NALU_TYPE_xxxx
  2150. +
  2151. + size_t m_nNALStartPos; //! NALU start (including startcode / size)
  2152. + size_t m_nNALDataPos; //! Useful part
  2153. +
  2154. + const BYTE *m_pBuffer; // buffer being parsed; stored by SetBuffer() and never freed by this class
  2155. + size_t m_nCurPos; // cursor; after ReadNext() it marks the end of the current unit
  2156. + size_t m_nNextRTP; // position of the next length prefix (length-prefixed mode only)
  2157. + size_t m_nSize; // size of m_pBuffer in bytes
  2158. + int m_nNALSize; // bytes per length prefix; 0 selects start-code scanning
  2159. +
  2160. + bool MoveToNextAnnexBStartcode();
  2161. + bool MoveToNextRTPStartcode();
  2162. +
  2163. +public :
  2164. + CH264Nalu() { SetBuffer(NULL, 0, 0); } // starts with an empty buffer; the three header fields keep no defined value until ReadNext() succeeds
  2165. + NALU_TYPE GetType() const { return nal_unit_type; }
  2166. + bool IsRefFrame() const { return (nal_reference_idc != 0); }
  2167. +
  2168. + size_t GetDataLength() const { return m_nCurPos - m_nNALDataPos; } // bytes from the NAL header byte up to the cursor
  2169. + const BYTE *GetDataBuffer() { return m_pBuffer + m_nNALDataPos; } // points at the NAL header byte
  2170. + size_t GetRoundedDataLength() const
  2171. + {
  2172. + size_t nSize = m_nCurPos - m_nNALDataPos;
  2173. + return nSize + 128 - (nSize %128); // next multiple of 128; a length that is already a multiple still gains a full 128
  2174. + }
  2175. +
  2176. + size_t GetLength() const { return m_nCurPos - m_nNALStartPos; } // length including the start code / length prefix
  2177. + const BYTE *GetNALBuffer() { return m_pBuffer + m_nNALStartPos; }
  2178. + bool IsEOF() const { return m_nCurPos >= m_nSize; }
  2179. +
  2180. + void SetBuffer (const BYTE *pBuffer, size_t nSize, int nNALSize);
  2181. + bool ReadNext();
  2182. +};
  2183. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp
  2184. new file mode 100644
  2185. index 0000000..f25700e
  2186. --- /dev/null
  2187. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp
  2188. @@ -0,0 +1,186 @@
  2189. +/*
  2190. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2191. + * http://www.1f0.de
  2192. + *
  2193. + * This program is free software; you can redistribute it and/or modify
  2194. + * it under the terms of the GNU General Public License as published by
  2195. + * the Free Software Foundation; either version 2 of the License, or
  2196. + * (at your option) any later version.
  2197. + *
  2198. + * This program is distributed in the hope that it will be useful,
  2199. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2200. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2201. + * GNU General Public License for more details.
  2202. + *
  2203. + * You should have received a copy of the GNU General Public License along
  2204. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2205. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2206. + */
  2207. +
  2208. +
  2209. +#include "H264SequenceParser.h"
  2210. +
  2211. +#include "ByteParser.h"
  2212. +#include "H264Nalu.h"
  2213. +#include "DllAvCodec.h"
  2214. +
  2215. +CH264SequenceParser::CH264SequenceParser(void) // Starts with both result structs zeroed, so sps.valid and pps.valid are 0.
  2216. +{
  2217. + ZeroMemory(&sps, sizeof(sps));
  2218. + ZeroMemory(&pps, sizeof(pps));
  2219. +}
  2220. +
  2221. +
  2222. +CH264SequenceParser::~CH264SequenceParser(void) // Nothing to release: the class only holds the plain sps / pps structs.
  2223. +{
  2224. +}
  2225. +
  2226. +HRESULT CH264SequenceParser::ParseNALs(const BYTE *buffer, size_t buflen, int nal_size) // Parses the first non-empty SPS NAL unit found in buffer into `sps`; always returns S_OK.
  2227. +{
  2228. + CH264Nalu nalu;
  2229. + nalu.SetBuffer(buffer, buflen, nal_size);
  2230. +
  2231. + while (nalu.ReadNext()) {
  2232. + // Require the one-byte NAL header plus at least one payload byte: for an empty unit the
  2233. + // unsigned "length - 1" below would wrap around to a huge value.
  2234. + if (nalu.GetType() != NALU_TYPE_SPS || nalu.GetDataLength() < 2)
  2235. + continue;
  2236. + ParseSPS(nalu.GetDataBuffer() + 1, nalu.GetDataLength() - 1); // skip the NAL header byte
  2237. + break;
  2238. + }
  2239. +
  2240. + return S_OK;
  2241. +}
  2242. +
  2243. +static void SPSDecodeScalingList(CByteParser &parser, int size) {
  2244. + // Walks over one scaling list purely to consume its bits; no value is kept.
  2245. + if (!parser.BitRead(1)) // leading flag clear: nothing more is read for this list
  2246. + return;
  2247. + int prev = 8, cur = 8;
  2248. + for (int idx = 0; idx < size; ++idx) {
  2249. + if (cur != 0) // once a value of 0 has been produced, no further deltas are read
  2250. + cur = (prev + parser.SExpGolombRead()) & 0xff;
  2251. + if (idx == 0 && cur == 0) /* matrix not written */
  2252. + break;
  2253. + if (cur != 0)
  2254. + prev = cur;
  2255. + }
  2256. +}
  2257. +
  2258. +HRESULT CH264SequenceParser::ParseSPS(const BYTE *buffer, size_t buflen) // Reads the SPS fields in bitstream order and fills `sps`; always returns S_OK.
  2259. +{ // NOTE(review): the bytes are parsed as passed in; no emulation-prevention (00 00 03) removal is visible here - confirm that callers do not need it
  2260. + CByteParser parser(buffer, buflen);
  2261. + int i;
  2262. +
  2263. + ZeroMemory(&sps, sizeof(sps));
  2264. + // Defaults
  2265. + sps.valid = 1; // set before any field is read; nothing below clears it again
  2266. + sps.primaries = AVCOL_PRI_UNSPECIFIED;
  2267. + sps.trc = AVCOL_TRC_UNSPECIFIED;
  2268. + sps.colorspace = AVCOL_SPC_UNSPECIFIED;
  2269. + sps.full_range = -1; // stays -1 unless the video signal type block below is present
  2270. +
  2271. + // Parse
  2272. + sps.profile = parser.BitRead(8);
  2273. + parser.BitRead(4); // constraint flags
  2274. + parser.BitRead(4); // reserved
  2275. + sps.level = parser.BitRead(8);
  2276. + parser.UExpGolombRead(); // sps id
  2277. +
  2278. + if (sps.profile >= 100) { // NOTE(review): every profile value >= 100 is assumed to carry the fields below - confirm against the H.264 profile list
  2279. + sps.chroma = (int)parser.UExpGolombRead();
  2280. + if (sps.chroma == 3)
  2281. + parser.BitRead(1);
  2282. + sps.luma_bitdepth = (int)parser.UExpGolombRead() + 8;
  2283. + sps.chroma_bitdepth = (int)parser.UExpGolombRead() + 8;
  2284. + parser.BitRead(1); // transform_bypass
  2285. +
  2286. + // decode scaling matrices
  2287. + int scaling = parser.BitRead(1);
  2288. + if (scaling) {
  2289. + // Decode scaling lists
  2290. + SPSDecodeScalingList(parser, 16); // Intra, Y
  2291. + SPSDecodeScalingList(parser, 16); // Intra, Cr
  2292. + SPSDecodeScalingList(parser, 16); // Intra, Cb
  2293. + SPSDecodeScalingList(parser, 16); // Inter, Y
  2294. + SPSDecodeScalingList(parser, 16); // Inter, Cr
  2295. + SPSDecodeScalingList(parser, 16); // Inter, Cb
  2296. +
  2297. + SPSDecodeScalingList(parser, 64); // Intra, Y
  2298. + if (sps.chroma == 3) {
  2299. + SPSDecodeScalingList(parser, 64); // Intra, Cr
  2300. + SPSDecodeScalingList(parser, 64); // Intra, Cb
  2301. + }
  2302. + SPSDecodeScalingList(parser, 64); // Inter, Y
  2303. + if (sps.chroma == 3) {
  2304. + SPSDecodeScalingList(parser, 64); // Inter, Cr
  2305. + SPSDecodeScalingList(parser, 64); // Inter, Cb
  2306. + }
  2307. + }
  2308. + } else {
  2309. + sps.chroma = 1;
  2310. + sps.luma_bitdepth = 8;
  2311. + sps.chroma_bitdepth = 8;
  2312. + }
  2313. +
  2314. + parser.UExpGolombRead(); // log2_max_frame_num
  2315. + int poc_type = (int)parser.UExpGolombRead(); // poc_type
  2316. + if (poc_type == 0)
  2317. + parser.UExpGolombRead(); // log2_max_poc_lsb
  2318. + else if (poc_type == 1) {
  2319. + parser.BitRead(1); // delta_pic_order_always_zero_flag
  2320. + parser.SExpGolombRead(); // offset_for_non_ref_pic
  2321. + parser.SExpGolombRead(); // offset_for_top_to_bottom_field
  2322. + int cyclen = (int)parser.UExpGolombRead(); // poc_cycle_length
  2323. + for (i = 0; i < cyclen; i++) // NOTE(review): the loop count comes straight from the bitstream and is not range-checked
  2324. + parser.SExpGolombRead(); // offset_for_ref_frame[i]
  2325. + }
  2326. +
  2327. + sps.ref_frames = parser.UExpGolombRead(); // ref_frame_count
  2328. + parser.BitRead(1); // gaps_in_frame_num_allowed_flag
  2329. + parser.UExpGolombRead(); // mb_width
  2330. + parser.UExpGolombRead(); // mb_height
  2331. + sps.interlaced = !parser.BitRead(1); // frame_mbs_only_flag
  2332. + if (sps.interlaced)
  2333. + parser.BitRead(1); // mb_aff
  2334. +
  2335. + parser.BitRead(1); // direct_8x8_inference_flag
  2336. + int crop = parser.BitRead(1); // crop
  2337. + if (crop) {
  2338. + parser.UExpGolombRead(); // crop_left
  2339. + parser.UExpGolombRead(); // crop_right
  2340. + parser.UExpGolombRead(); // crop_top
  2341. + parser.UExpGolombRead(); // crop_bottom
  2342. + }
  2343. +
  2344. + int vui_present = parser.BitRead(1); // vui_parameters_present_flag
  2345. + if (vui_present) {
  2346. + sps.ar_present = parser.BitRead(1); // aspect_ratio_info_present_flag
  2347. + if (sps.ar_present) {
  2348. + int ar_idc = parser.BitRead(8); // aspect_ratio_idc
  2349. + if (ar_idc == 255) {
  2350. + parser.BitRead(16); // sar.num
  2351. + parser.BitRead(16); // sar.den
  2352. + }
  2353. + }
  2354. +
  2355. + int overscan = parser.BitRead(1); // overscan_info_present_flag
  2356. + if (overscan)
  2357. + parser.BitRead(1); // overscan_appropriate_flag
  2358. +
  2359. + int vid_sig_type = parser.BitRead(1); // video_signal_type_present_flag
  2360. + if (vid_sig_type) {
  2361. + parser.BitRead(3); // video_format
  2362. + sps.full_range = parser.BitRead(1); // video_full_range_flag
  2363. +
  2364. + int colorinfo = parser.BitRead(1); // colour_description_present_flag
  2365. + if (colorinfo) {
  2366. + sps.primaries = parser.BitRead(8);
  2367. + sps.trc = parser.BitRead(8);
  2368. + sps.colorspace = parser.BitRead(8);
  2369. + }
  2370. + }
  2371. + }
  2372. +
  2373. + return S_OK;
  2374. +}
  2375. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h
  2376. new file mode 100644
  2377. index 0000000..2827bfd
  2378. --- /dev/null
  2379. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h
  2380. @@ -0,0 +1,55 @@
  2381. +/*
  2382. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2383. + * http://www.1f0.de
  2384. + *
  2385. + * This program is free software; you can redistribute it and/or modify
  2386. + * it under the terms of the GNU General Public License as published by
  2387. + * the Free Software Foundation; either version 2 of the License, or
  2388. + * (at your option) any later version.
  2389. + *
  2390. + * This program is distributed in the hope that it will be useful,
  2391. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2392. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2393. + * GNU General Public License for more details.
  2394. + *
  2395. + * You should have received a copy of the GNU General Public License along
  2396. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2397. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2398. + */
  2399. +
  2400. +#pragma once
  2401. +
  2402. +class CH264SequenceParser // Extracts selected sequence parameter set (SPS) fields from an H.264 buffer.
  2403. +{
  2404. +public:
  2405. + CH264SequenceParser(void);
  2406. + virtual ~CH264SequenceParser(void);
  2407. +
  2408. + HRESULT ParseNALs(const BYTE *buffer, size_t buflen, int nal_size); // nal_size is handed to CH264Nalu::SetBuffer(); 0 selects start-code scanning
  2409. +
  2410. +public:
  2411. + struct { // results of the most recent SPS parse
  2412. + int valid; // 0 after construction, 1 once ParseSPS() has run
  2413. +
  2414. + int profile;
  2415. + int level;
  2416. + int chroma;
  2417. + int luma_bitdepth;
  2418. + int chroma_bitdepth;
  2419. + int ref_frames;
  2420. + int interlaced; // inverse of frame_mbs_only_flag
  2421. + int ar_present;
  2422. +
  2423. + int full_range; // -1 when the stream does not signal it
  2424. + int primaries;
  2425. + int trc;
  2426. + int colorspace;
  2427. + } sps;
  2428. +
  2429. + struct {
  2430. + int valid; // zeroed in the constructor; no code in H264SequenceParser.cpp writes it afterwards
  2431. + } pps;
  2432. +
  2433. +private:
  2434. + HRESULT ParseSPS(const BYTE *buffer, size_t buflen);
  2435. +};
  2436. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp
  2437. new file mode 100644
  2438. index 0000000..6889638
  2439. --- /dev/null
  2440. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp
  2441. @@ -0,0 +1,109 @@
  2442. +/*
  2443. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2444. + * http://www.1f0.de
  2445. + *
  2446. + * This program is free software; you can redistribute it and/or modify
  2447. + * it under the terms of the GNU General Public License as published by
  2448. + * the Free Software Foundation; either version 2 of the License, or
  2449. + * (at your option) any later version.
  2450. + *
  2451. + * This program is distributed in the hope that it will be useful,
  2452. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2453. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2454. + * GNU General Public License for more details.
  2455. + *
  2456. + * You should have received a copy of the GNU General Public License along
  2457. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2458. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2459. + */
  2460. +
  2461. +
  2462. +#include "MPEG2HeaderParser.h"
  2463. +
  2464. +#pragma warning( push )
  2465. +#pragma warning( disable : 4018 )
  2466. +#pragma warning( disable : 4244 )
  2467. +#define AVCODEC_X86_MATHOPS_H
  2468. +#include "libavcodec/get_bits.h"
  2469. +#pragma warning( pop )
  2470. +
  2471. +#define SEQ_START_CODE 0x000001b3
  2472. +#define EXT_START_CODE 0x000001b5
  2473. +// Returns the position of the first 00 00 01 xx start code in [src, end), or end if there is none.
  2474. +static inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
  2475. +{
  2476. + uint32_t mrk = 0xFFFFFFFF; // the last four bytes seen, newest in the low byte
  2477. +
  2478. + if(end-src < 4) return end; // fewer than 4 bytes cannot hold a complete start code
  2479. + while(src < end){
  2480. + mrk = (mrk << 8) | *src++;
  2481. + if((mrk & ~0xFF) == 0x00000100) // upper three bytes are 00 00 01; the low byte is ignored
  2482. + return src-4; // src is one past the fourth byte, so step back to the first
  2483. + }
  2484. + return end;
  2485. +}
  2486. +
  2487. +CMPEG2HeaderParser::CMPEG2HeaderParser(const BYTE *pData, size_t length)
  2488. +{
  2489. + memset(&hdr, 0, sizeof(hdr));
  2490. + ParseMPEG2Header(pData, length);
  2491. +}
  2492. +
  2493. +CMPEG2HeaderParser::~CMPEG2HeaderParser(void)
  2494. +{
  2495. +}
  2496. +// Walks every start-code-delimited unit in pData and hands sequence / extension units to their parsers.
  2497. +void CMPEG2HeaderParser::ParseMPEG2Header(const BYTE *pData, size_t length)
  2498. +{
  2499. + if (length < 16) // nothing to parse; hdr is left untouched
  2500. + return;
  2501. +
  2502. + GetBitContext gb;
  2503. +
  2504. + const uint8_t *start = pData;
  2505. + const uint8_t *end = start + length;
  2506. + const uint8_t *next = NULL;
  2507. +
  2508. + int size;
  2509. +
  2510. + start = find_next_marker(start, end);
  2511. + next = start;
  2512. +
  2513. + for(; next < end; start = next) {
  2514. + next = find_next_marker(start + 4, end);
  2515. + size = (int)(next - start - 4); // payload bytes between this start code and the next one (or end)
  2516. + if(size < 4) continue; // was `size <= 0`: payloads of 1-3 bytes made the bit count below negative
  2517. + // NOTE(review): the bit count excludes the last 4 payload bytes - presumably read-ahead slack for the bit reader; confirm.
  2518. + init_get_bits(&gb, start + 4, (size - 4) * 8);
  2519. +
  2520. + switch(AV_RB32(start)) { // the 4 bytes at start are the start code itself
  2521. + case SEQ_START_CODE:
  2522. + MPEG2ParseSequenceHeader(&gb);
  2523. + break;
  2524. + case EXT_START_CODE:
  2525. + MPEG2ParseExtHeader(&gb);
  2526. + break;
  2527. + }
  2528. + }
  2529. +}
  2530. +
  2531. +void CMPEG2HeaderParser::MPEG2ParseSequenceHeader(GetBitContext *gb)
  2532. +{
  2533. +}
  2534. +// Reads the leading fields of an extension unit into hdr; only units whose 4-bit identifier is 1 are handled.
  2535. +void CMPEG2HeaderParser::MPEG2ParseExtHeader(GetBitContext *gb)
  2536. +{
  2537. + int startcode = get_bits(gb, 4); // 4-bit identifier at the start of the extension payload
  2538. + if (startcode == 1) { // presumably the sequence extension - confirm against ISO/IEC 13818-2
  2539. + hdr.valid = 1;
  2540. +
  2541. + skip_bits(gb, 1); // profile and level esc
  2542. + hdr.profile = get_bits(gb, 3);
  2543. + hdr.level = get_bits(gb, 4);
  2544. +
  2545. + hdr.interlaced = !get_bits1(gb); // stored inverted: a set bit yields interlaced == 0 (presumably progressive_sequence - confirm)
  2546. + hdr.chroma = get_bits(gb, 2);
  2547. +
  2548. + // TODO: Fill in other fields, if needed
  2549. + }
  2550. +}
  2551. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h
  2552. new file mode 100644
  2553. index 0000000..b45837a
  2554. --- /dev/null
  2555. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h
  2556. @@ -0,0 +1,45 @@
  2557. +/*
  2558. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2559. + * http://www.1f0.de
  2560. + *
  2561. + * This program is free software; you can redistribute it and/or modify
  2562. + * it under the terms of the GNU General Public License as published by
  2563. + * the Free Software Foundation; either version 2 of the License, or
  2564. + * (at your option) any later version.
  2565. + *
  2566. + * This program is distributed in the hope that it will be useful,
  2567. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2568. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2569. + * GNU General Public License for more details.
  2570. + *
  2571. + * You should have received a copy of the GNU General Public License along
  2572. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2573. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2574. + */
  2575. +
  2576. +#pragma once
  2577. +
  2578. +struct GetBitContext;
  2579. +
  2580. +class CMPEG2HeaderParser // parses a header blob once, at construction; results are exposed through hdr
  2581. +{
  2582. +public:
  2583. + CMPEG2HeaderParser(const BYTE *pData, size_t length); // zeroes hdr, then parses pData[0..length)
  2584. + ~CMPEG2HeaderParser(void);
  2585. +
  2586. +public:
  2587. + struct {
  2588. + int valid; // set to 1 once an extension unit with identifier 1 has been seen
  2589. +
  2590. + int profile; // 3-bit field read from that extension unit
  2591. + int level; // 4-bit field read from that extension unit
  2592. +
  2593. + int interlaced; // logical NOT of the one-bit flag that follows level
  2594. + int chroma; // 2-bit field that follows the flag above
  2595. + } hdr;
  2596. +
  2597. +private:
  2598. + void ParseMPEG2Header(const BYTE *pData, size_t length); // splits the buffer at start codes and dispatches each unit
  2599. + void MPEG2ParseSequenceHeader(GetBitContext *gb); // currently an empty stub
  2600. + void MPEG2ParseExtHeader(GetBitContext *gb); // fills hdr from an extension unit
  2601. +};
  2602. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp
  2603. new file mode 100644
  2604. index 0000000..e731eab
  2605. --- /dev/null
  2606. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp
  2607. @@ -0,0 +1,203 @@
  2608. +/*
  2609. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2610. + * http://www.1f0.de
  2611. + *
  2612. + * This program is free software; you can redistribute it and/or modify
  2613. + * it under the terms of the GNU General Public License as published by
  2614. + * the Free Software Foundation; either version 2 of the License, or
  2615. + * (at your option) any later version.
  2616. + *
  2617. + * This program is distributed in the hope that it will be useful,
  2618. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2619. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2620. + * GNU General Public License for more details.
  2621. + *
  2622. + * You should have received a copy of the GNU General Public License along
  2623. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2624. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2625. + */
  2626. +
  2627. +
  2628. +#include "VC1HeaderParser.h"
  2629. +
  2630. +#pragma warning( push )
  2631. +#pragma warning( disable : 4018 )
  2632. +#pragma warning( disable : 4244 )
  2633. +extern "C" {
  2634. +#define AVCODEC_X86_MATHOPS_H
  2635. +#include "libavcodec/get_bits.h"
  2636. +//extern __declspec(dllimport) const AVRational ff_vc1_pixel_aspect[16];
  2637. +};
  2638. +#pragma warning( pop )
  2639. +
  2640. +const AVRational ff_vc1_pixel_aspect[16] = {
  2641. + { 0, 1 },
  2642. + { 1, 1 },
  2643. + { 12, 11 },
  2644. + { 10, 11 },
  2645. + { 16, 11 },
  2646. + { 40, 33 },
  2647. + { 24, 11 },
  2648. + { 20, 11 },
  2649. + { 32, 11 },
  2650. + { 80, 33 },
  2651. + { 18, 11 },
  2652. + { 15, 11 },
  2653. + { 64, 33 },
  2654. + { 160, 99 },
  2655. + { 0, 1 },
  2656. + { 0, 1 }
  2657. +};
  2658. +
  2659. +/** Markers used in VC-1 AP frame data */
  2660. +//@{
  2661. +enum VC1Code{
  2662. + VC1_CODE_RES0 = 0x00000100,
  2663. + VC1_CODE_ENDOFSEQ = 0x0000010A,
  2664. + VC1_CODE_SLICE,
  2665. + VC1_CODE_FIELD,
  2666. + VC1_CODE_FRAME,
  2667. + VC1_CODE_ENTRYPOINT,
  2668. + VC1_CODE_SEQHDR,
  2669. +};
  2670. +//@}
  2671. +
  2672. +/** Available Profiles */
  2673. +//@{
  2674. +enum Profile {
  2675. + PROFILE_SIMPLE,
  2676. + PROFILE_MAIN,
  2677. + PROFILE_COMPLEX, ///< TODO: WMV9 specific
  2678. + PROFILE_ADVANCED
  2679. +};
  2680. +//@}
  2681. +
  2682. +#define IS_MARKER(x) (((x) & ~0xFF) == VC1_CODE_RES0)
  2683. +
  2684. +/** Find VC-1 marker in buffer
  2685. +* @return position where next marker starts or end of buffer if no marker found
  2686. +*/
  2687. +static inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
  2688. +{
  2689. + uint32_t mrk = 0xFFFFFFFF; // the last four bytes seen, newest in the low byte
  2690. +
  2691. + if(end-src < 4) return end; // fewer than 4 bytes cannot hold a complete marker
  2692. + while(src < end){
  2693. + mrk = (mrk << 8) | *src++;
  2694. + if(IS_MARKER(mrk)) // upper three bytes equal 00 00 01; the low byte is ignored
  2695. + return src-4; // src is one past the fourth byte, so step back to the first
  2696. + }
  2697. + return end;
  2698. +}
  2699. +// Copies size bytes from src to dst, dropping every 0x03 that follows 00 00 and precedes a byte < 4; returns the bytes written.
  2700. +static inline int vc1_unescape_buffer(const uint8_t *src, int size, uint8_t *dst)
  2701. +{
  2702. + int dsize = 0, i;
  2703. +
  2704. + if(size < 4){ // too short to contain an escape sequence: copy verbatim
  2705. + for(dsize = 0; dsize < size; dsize++) *dst++ = *src++;
  2706. + return size;
  2707. + }
  2708. + for(i = 0; i < size; i++, src++) {
  2709. + if(src[0] == 3 && i >= 2 && !src[-1] && !src[-2] && i < size-1 && src[1] < 4) { // i >= 2 and i < size-1 keep src[-2] and src[1] inside the input
  2710. + dst[dsize++] = src[1]; // emit the byte after the 0x03 ...
  2711. + src++; // ... and step over it so it is not copied twice
  2712. + i++;
  2713. + } else
  2714. + dst[dsize++] = *src;
  2715. + }
  2716. + return dsize;
  2717. +}
  2718. +
  2719. +CVC1HeaderParser::CVC1HeaderParser(const BYTE *pData, size_t length) // parses pData[0..length) once; results land in hdr
  2720. +{
  2721. + memset(&hdr, 0, sizeof(hdr)); // zero first so hdr.valid == 0 on every early exit
  2722. + if (!m_dllAvUtil.Load()) return; // avutil unavailable: the av_mallocz / av_freep / av_reduce calls made while parsing cannot be used
  2723. + ParseVC1Header(pData, length);
  2724. +}
  2725. +
  2726. +CVC1HeaderParser::~CVC1HeaderParser(void)
  2727. +{
  2728. +}
  2729. +// Unescapes each start-code-delimited unit of pData into a scratch buffer and parses any sequence header found.
  2730. +void CVC1HeaderParser::ParseVC1Header(const BYTE *pData, size_t length)
  2731. +{
  2732. + if (length < 16) // nothing to parse; hdr is left untouched
  2733. + return;
  2734. +
  2735. + GetBitContext gb;
  2736. +
  2737. + const uint8_t *start = pData;
  2738. + const uint8_t *end = start + length;
  2739. + const uint8_t *next = NULL;
  2740. +
  2741. + int size, buf2_size;
  2742. + uint8_t *buf2; // scratch buffer for one unescaped unit; no unit can exceed length bytes
  2743. +
  2744. + buf2 = (uint8_t *)m_dllAvUtil.av_mallocz(length + 16);//FF_INPUT_BUFFER_PADDING_SIZE);
  2745. + if (!buf2) return; // allocation failed; vc1_unescape_buffer below would otherwise write through NULL
  2746. + start = find_next_marker(start, end);
  2747. + next = start;
  2748. +
  2749. + for(; next < end; start = next) {
  2750. + next = find_next_marker(start + 4, end);
  2751. + size = (int)(next - start - 4); // payload bytes between this marker and the next one (or end)
  2752. + if(size <= 0) continue;
  2753. + buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
  2754. +
  2755. + init_get_bits(&gb, buf2, buf2_size * 8);
  2756. +
  2757. + switch(AV_RB32(start)) { // the 4 bytes at start are the marker itself
  2758. + case VC1_CODE_SEQHDR:
  2759. + VC1ParseSequenceHeader(&gb);
  2760. + break;
  2761. + }
  2762. + }
  2763. + m_dllAvUtil.av_freep(&buf2);
  2764. +}
  2765. +// Reads a sequence header from gb into hdr; only the advanced profile is decoded beyond the profile field.
  2766. +void CVC1HeaderParser::VC1ParseSequenceHeader(GetBitContext *gb)
  2767. +{
  2768. + hdr.profile = get_bits(gb, 2);
  2769. +
  2770. + if (hdr.profile == PROFILE_ADVANCED) { // for other profiles only hdr.profile is written
  2771. + hdr.valid = 1;
  2772. +
  2773. + hdr.level = get_bits(gb, 3);
  2774. + skip_bits(gb, 2); // Chroma Format, only 1 should be set for 4:2:0
  2775. + skip_bits(gb, 3); // frmrtq_postproc
  2776. + skip_bits(gb, 5); // bitrtq_postproc
  2777. + skip_bits1(gb); // postprocflag
  2778. +
  2779. + hdr.width = (get_bits(gb, 12) + 1) << 1; // 12-bit field holds (width / 2) - 1
  2780. + hdr.height = (get_bits(gb, 12) + 1) << 1; // 12-bit field holds (height / 2) - 1
  2781. +
  2782. + hdr.broadcast = get_bits1(gb); // broadcast
  2783. + hdr.interlaced = get_bits1(gb); // interlaced
  2784. +
  2785. + skip_bits1(gb); // tfcntrflag
  2786. + skip_bits1(gb); // finterpflag
  2787. + skip_bits1(gb); // reserved
  2788. + skip_bits1(gb); // psf
  2789. +
  2790. + if (get_bits1(gb)) { // Display Info
  2791. + int w, h, ar = 0; // ar stays 0 when no aspect-ratio index is signalled
  2792. + w = get_bits(gb, 14) + 1; // display width, stored minus one
  2793. + h = get_bits(gb, 14) + 1; // display height, stored minus one
  2794. + if (get_bits1(gb)) // aspect-ratio index present
  2795. + ar = get_bits(gb, 4);
  2796. + if (ar && ar < 14) { // indices 1..13: ratio comes from the table
  2797. + hdr.ar = ff_vc1_pixel_aspect[ar];
  2798. + } else if (ar == 15) { // explicit ratio: two 8-bit fields, each stored minus one
  2799. + w = get_bits(gb, 8) + 1;
  2800. + h = get_bits(gb, 8) + 1;
  2801. + hdr.ar.num = w;
  2802. + hdr.ar.den = h;
  2803. + } else { // ar is 0 or 14: derive the ratio from display size (w x h) versus coded size
  2804. + m_dllAvUtil.av_reduce(&hdr.ar.num, &hdr.ar.den, hdr.height * w, hdr.width * h, 1 << 30);
  2805. + }
  2806. + }
  2807. +
  2808. + // TODO: add other fields
  2809. + }
  2810. +}
  2811. \ No newline at end of file
  2812. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h
  2813. new file mode 100644
  2814. index 0000000..ef999c7
  2815. --- /dev/null
  2816. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h
  2817. @@ -0,0 +1,52 @@
  2818. +/*
  2819. + * Copyright (C) 2010-2012 Hendrik Leppkes
  2820. + * http://www.1f0.de
  2821. + *
  2822. + * This program is free software; you can redistribute it and/or modify
  2823. + * it under the terms of the GNU General Public License as published by
  2824. + * the Free Software Foundation; either version 2 of the License, or
  2825. + * (at your option) any later version.
  2826. + *
  2827. + * This program is distributed in the hope that it will be useful,
  2828. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  2829. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2830. + * GNU General Public License for more details.
  2831. + *
  2832. + * You should have received a copy of the GNU General Public License along
  2833. + * with this program; if not, write to the Free Software Foundation, Inc.,
  2834. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  2835. + */
  2836. +
  2837. +#pragma once
  2838. +
  2839. +#include "DllAvUtil.h"
  2840. +
  2841. +struct GetBitContext;
  2842. +
  2843. +class CVC1HeaderParser // parses a header blob once, at construction; results are exposed through hdr
  2844. +{
  2845. +public:
  2846. + CVC1HeaderParser(const BYTE *pData, size_t length); // loads avutil, zeroes hdr, then parses pData[0..length)
  2847. + ~CVC1HeaderParser(void);
  2848. +
  2849. +public:
  2850. + struct {
  2851. + int valid; // set to 1 when a sequence header with the advanced profile was read
  2852. +
  2853. + int profile; // 2-bit profile field (see enum Profile in the .cpp)
  2854. + int level; // 3-bit field, advanced profile only
  2855. +
  2856. + int width; // (12-bit field + 1) * 2
  2857. + int height; // (12-bit field + 1) * 2
  2858. +
  2859. + int broadcast; // one-bit flag
  2860. + int interlaced; // one-bit flag
  2861. +
  2862. + AVRational ar; // written only when the display-info bit is set; otherwise left zeroed
  2863. + } hdr;
  2864. +
  2865. +private:
  2866. + void ParseVC1Header(const BYTE *pData, size_t length); // splits the buffer at markers, unescapes and dispatches each unit
  2867. + void VC1ParseSequenceHeader(GetBitContext *gb); // fills hdr from a sequence header unit
  2868. + DllAvUtil m_dllAvUtil; // provides av_mallocz / av_freep / av_reduce
  2869. +};
  2870. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h
  2871. new file mode 100644
  2872. index 0000000..4c6505a
  2873. --- /dev/null
  2874. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h
  2875. @@ -0,0 +1,8127 @@
  2876. +/*
  2877. + * Copyright 1993-2011 NVIDIA Corporation. All rights reserved.
  2878. + *
  2879. + * NOTICE TO LICENSEE:
  2880. + *
  2881. + * This source code and/or documentation ("Licensed Deliverables") are
  2882. + * subject to NVIDIA intellectual property rights under U.S. and
  2883. + * international Copyright laws.
  2884. + *
  2885. + * These Licensed Deliverables contained herein is PROPRIETARY and
  2886. + * CONFIDENTIAL to NVIDIA and is being provided under the terms and
  2887. + * conditions of a form of NVIDIA software license agreement by and
  2888. + * between NVIDIA and Licensee ("License Agreement") or electronically
  2889. + * accepted by Licensee. Notwithstanding any terms or conditions to
  2890. + * the contrary in the License Agreement, reproduction or disclosure
  2891. + * of the Licensed Deliverables to any third party without the express
  2892. + * written consent of NVIDIA is prohibited.
  2893. + *
  2894. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  2895. + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  2896. + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
  2897. + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  2898. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  2899. + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  2900. + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  2901. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  2902. + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  2903. + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  2904. + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  2905. + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  2906. + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  2907. + * OF THESE LICENSED DELIVERABLES.
  2908. + *
  2909. + * U.S. Government End Users. These Licensed Deliverables are a
  2910. + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  2911. + * 1995), consisting of "commercial computer software" and "commercial
  2912. + * computer software documentation" as such terms are used in 48
  2913. + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
  2914. + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
  2915. + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  2916. + * U.S. Government End Users acquire the Licensed Deliverables with
  2917. + * only those rights set forth herein.
  2918. + *
  2919. + * Any use of the Licensed Deliverables in individual and commercial
  2920. + * software must include, in the user documentation and internal
  2921. + * comments to the code, the above Disclaimer and U.S. Government End
  2922. + * Users Notice.
  2923. + */
  2924. +
  2925. +#ifndef __cuda_cuda_h__
  2926. +#define __cuda_cuda_h__
  2927. +
  2928. +#include <stdlib.h>
  2929. +
  2930. +/**
  2931. + * CUDA API versioning support
  2932. + */
  2933. +#if defined(CUDA_FORCE_API_VERSION)
  2934. + #if (CUDA_FORCE_API_VERSION == 3010)
  2935. + #define __CUDA_API_VERSION 3010
  2936. + #else
  2937. + #error "Unsupported value of CUDA_FORCE_API_VERSION"
  2938. + #endif
  2939. +#else
  2940. + #define __CUDA_API_VERSION 4020
  2941. +#endif /* CUDA_FORCE_API_VERSION */
  2942. +
  2943. +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
  2944. + #define cuDeviceTotalMem cuDeviceTotalMem_v2
  2945. + #define cuCtxCreate cuCtxCreate_v2
  2946. + #define cuModuleGetGlobal cuModuleGetGlobal_v2
  2947. + #define cuMemGetInfo cuMemGetInfo_v2
  2948. + #define cuMemAlloc cuMemAlloc_v2
  2949. + #define cuMemAllocPitch cuMemAllocPitch_v2
  2950. + #define cuMemFree cuMemFree_v2
  2951. + #define cuMemGetAddressRange cuMemGetAddressRange_v2
  2952. + #define cuMemAllocHost cuMemAllocHost_v2
  2953. + #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
  2954. + #define cuMemcpyHtoD cuMemcpyHtoD_v2
  2955. + #define cuMemcpyDtoH cuMemcpyDtoH_v2
  2956. + #define cuMemcpyDtoD cuMemcpyDtoD_v2
  2957. + #define cuMemcpyDtoA cuMemcpyDtoA_v2
  2958. + #define cuMemcpyAtoD cuMemcpyAtoD_v2
  2959. + #define cuMemcpyHtoA cuMemcpyHtoA_v2
  2960. + #define cuMemcpyAtoH cuMemcpyAtoH_v2
  2961. + #define cuMemcpyAtoA cuMemcpyAtoA_v2
  2962. + #define cuMemcpyHtoAAsync cuMemcpyHtoAAsync_v2
  2963. + #define cuMemcpyAtoHAsync cuMemcpyAtoHAsync_v2
  2964. + #define cuMemcpy2D cuMemcpy2D_v2
  2965. + #define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
  2966. + #define cuMemcpy3D cuMemcpy3D_v2
  2967. + #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
  2968. + #define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
  2969. + #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
  2970. + #define cuMemcpy2DAsync cuMemcpy2DAsync_v2
  2971. + #define cuMemcpy3DAsync cuMemcpy3DAsync_v2
  2972. + #define cuMemsetD8 cuMemsetD8_v2
  2973. + #define cuMemsetD16 cuMemsetD16_v2
  2974. + #define cuMemsetD32 cuMemsetD32_v2
  2975. + #define cuMemsetD2D8 cuMemsetD2D8_v2
  2976. + #define cuMemsetD2D16 cuMemsetD2D16_v2
  2977. + #define cuMemsetD2D32 cuMemsetD2D32_v2
  2978. + #define cuArrayCreate cuArrayCreate_v2
  2979. + #define cuArrayGetDescriptor cuArrayGetDescriptor_v2
  2980. + #define cuArray3DCreate cuArray3DCreate_v2
  2981. + #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2
  2982. + #define cuTexRefSetAddress cuTexRefSetAddress_v2
  2983. + #define cuTexRefGetAddress cuTexRefGetAddress_v2
  2984. + #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2
  2985. +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
  2986. +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
  2987. + #define cuCtxDestroy cuCtxDestroy_v2
  2988. + #define cuCtxPopCurrent cuCtxPopCurrent_v2
  2989. + #define cuCtxPushCurrent cuCtxPushCurrent_v2
  2990. + #define cuStreamDestroy cuStreamDestroy_v2
  2991. + #define cuEventDestroy cuEventDestroy_v2
  2992. +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */
  2993. +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010
  2994. + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3
  2995. +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */
  2996. +
  2997. +#if !defined(__CUDA_API_VERSION_INTERNAL)
  2998. +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010
  2999. + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2
  3000. +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */
  3001. +#endif /* __CUDA_API_VERSION_INTERNAL */
  3002. +
  3003. +/**
  3004. + * \defgroup CUDA_DRIVER CUDA Driver API
  3005. + *
  3006. + * This section describes the low-level CUDA driver application programming
  3007. + * interface.
  3008. + *
  3009. + * @{
  3010. + */
  3011. +
  3012. +/**
  3013. + * \defgroup CUDA_TYPES Data types used by CUDA driver
  3014. + * @{
  3015. + */
  3016. +
  3017. +/**
  3018. + * CUDA API version number
  3019. + */
  3020. +#define CUDA_VERSION 4020
  3021. +
  3022. +#ifdef __cplusplus
  3023. +extern "C" {
  3024. +#endif
  3025. +
  3026. +/**
  3027. + * CUDA device pointer
  3028. + */
  3029. +#if __CUDA_API_VERSION >= 3020
  3030. +
  3031. +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
  3032. +typedef unsigned long long CUdeviceptr;
  3033. +#else
  3034. +typedef unsigned int CUdeviceptr;
  3035. +#endif
  3036. +
  3037. +#endif /* __CUDA_API_VERSION >= 3020 */
  3038. +
  3039. +typedef int CUdevice; /**< CUDA device */
  3040. +typedef struct CUctx_st *CUcontext; /**< CUDA context */
  3041. +typedef struct CUmod_st *CUmodule; /**< CUDA module */
  3042. +typedef struct CUfunc_st *CUfunction; /**< CUDA function */
  3043. +typedef struct CUarray_st *CUarray; /**< CUDA array */
  3044. +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */
  3045. +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */
  3046. +typedef struct CUevent_st *CUevent; /**< CUDA event */
  3047. +typedef struct CUstream_st *CUstream; /**< CUDA stream */
  3048. +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
  3049. +
  3050. +typedef struct CUuuid_st { /**< CUDA definition of UUID */
  3051. + char bytes[16];
  3052. +} CUuuid;
  3053. +
  3054. +
  3055. +#if __CUDA_API_VERSION >= 4010
  3056. +
  3057. +/**
  3058. + * Interprocess Handles
  3059. + */
  3060. +#define CU_IPC_HANDLE_SIZE 64
  3061. +
  3062. +typedef struct CUipcEventHandle_st {
  3063. + char reserved[CU_IPC_HANDLE_SIZE];
  3064. +} CUipcEventHandle;
  3065. +
  3066. +typedef struct CUipcMemHandle_st {
  3067. + char reserved[CU_IPC_HANDLE_SIZE];
  3068. +} CUipcMemHandle;
  3069. +
  3070. +typedef enum CUipcMem_flags_enum {
  3071. + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
  3072. +} CUipcMem_flags;
  3073. +
  3074. +#endif
  3075. +
  3076. +/**
  3077. + * Context creation flags
  3078. + */
  3079. +typedef enum CUctx_flags_enum {
  3080. + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
  3081. + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
  3082. + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
  3083. + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
  3084. + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling
  3085. + * \deprecated This flag was deprecated as of CUDA 4.0
  3086. + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
  3087. + CU_CTX_SCHED_MASK = 0x07,
  3088. + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */
  3089. + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */
  3090. + CU_CTX_FLAGS_MASK = 0x1f
  3091. +} CUctx_flags;
  3092. +
  3093. +/**
  3094. + * Event creation flags
  3095. + */
  3096. +typedef enum CUevent_flags_enum {
  3097. + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */
  3098. + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */
  3099. + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
  3100. + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
  3101. +} CUevent_flags;
  3102. +
  3103. +/**
  3104. + * Array formats
  3105. + */
  3106. +typedef enum CUarray_format_enum {
  3107. + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
  3108. + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
  3109. + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
  3110. + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
  3111. + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
  3112. + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
  3113. + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
  3114. + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
  3115. +} CUarray_format;
  3116. +
  3117. +/**
  3118. + * Texture reference addressing modes
  3119. + */
  3120. +typedef enum CUaddress_mode_enum {
  3121. + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
  3122. + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
  3123. + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
  3124. + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */
  3125. +} CUaddress_mode;
  3126. +
  3127. +/**
  3128. + * Texture reference filtering modes
  3129. + */
  3130. +typedef enum CUfilter_mode_enum {
  3131. + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
  3132. + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
  3133. +} CUfilter_mode;
  3134. +
  3135. +/**
  3136. + * Device properties
  3137. + */
  3138. +typedef enum CUdevice_attribute_enum {
  3139. + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
  3140. + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
  3141. + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
  3142. + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
  3143. + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
  3144. + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
  3145. + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
  3146. + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
  3147. + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
  3148. + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
  3149. + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
  3150. + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
  3151. + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
  3152. + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
  3153. + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
  3154. + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
  3155. + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
  3156. + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
  3157. + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
  3158. + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
  3159. + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
  3160. + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
  3161. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
  3162. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
  3163. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
  3164. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
  3165. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
  3166. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
  3167. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */
  3168. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */
  3169. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */
  3170. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
  3171. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
  3172. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
  3173. + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
  3174. + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
  3175. + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
  3176. + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
  3177. + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
  3178. + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */
  3179. + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
  3180. + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
  3181. + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
  3182. + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
  3183. + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
  3184. + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */
  3185. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
  3186. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */
  3187. + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */
  3188. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
  3189. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
  3190. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */
  3191. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */
  3192. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */
  3193. + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */
  3194. + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */
  3195. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */
  3196. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */
  3197. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */
  3198. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */
  3199. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */
  3200. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */
  3201. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */
  3202. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */
  3203. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */
  3204. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */
  3205. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */
  3206. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */
  3207. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */
  3208. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */
  3209. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */
  3210. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */
  3211. + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */
  3212. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */
  3213. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */
  3214. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */
  3215. + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72 /**< Maximum 2D linear texture pitch in bytes */
  3216. +} CUdevice_attribute;
  3217. +
  3218. +/**
  3219. + * Legacy device properties
  3220. + */
  3221. +typedef struct CUdevprop_st {
  3222. + int maxThreadsPerBlock; /**< Maximum number of threads per block */
  3223. + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
  3224. + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
  3225. + int sharedMemPerBlock; /**< Shared memory available per block in bytes */
  3226. + int totalConstantMemory; /**< Constant memory available on device in bytes */
  3227. + int SIMDWidth; /**< Warp size in threads */
  3228. + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */
  3229. + int regsPerBlock; /**< 32-bit registers available per block */
  3230. + int clockRate; /**< Clock frequency in kilohertz */
  3231. + int textureAlign; /**< Alignment requirement for textures */
  3232. +} CUdevprop;
  3233. +
  3234. +/**
  3235. + * Pointer information
  3236. + */
  3237. +typedef enum CUpointer_attribute_enum {
  3238. + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */
  3239. + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */
  3240. + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */
  3241. + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4 /**< The address at which a pointer's memory may be accessed on the host */
  3242. +} CUpointer_attribute;
  3243. +
  3244. +/**
  3245. + * Function properties
  3246. + */
  3247. +typedef enum CUfunction_attribute_enum {
  3248. + /**
  3249. + * The maximum number of threads per block, beyond which a launch of the
  3250. + * function would fail. This number depends on both the function and the
  3251. + * device on which the function is currently loaded.
  3252. + */
  3253. + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
  3254. +
  3255. + /**
  3256. + * The size in bytes of statically-allocated shared memory required by
  3257. + * this function. This does not include dynamically-allocated shared
  3258. + * memory requested by the user at runtime.
  3259. + */
  3260. + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
  3261. +
  3262. + /**
  3263. + * The size in bytes of user-allocated constant memory required by this
  3264. + * function.
  3265. + */
  3266. + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
  3267. +
  3268. + /**
  3269. + * The size in bytes of local memory used by each thread of this function.
  3270. + */
  3271. + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
  3272. +
  3273. + /**
  3274. + * The number of registers used by each thread of this function.
  3275. + */
  3276. + CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
  3277. +
  3278. + /**
  3279. + * The PTX virtual architecture version for which the function was
  3280. + * compiled. This value is the major PTX version * 10 + the minor PTX
  3281. + * version, so a PTX version 1.3 function would return the value 13.
  3282. + * Note that this may return the undefined value of 0 for cubins
  3283. + * compiled prior to CUDA 3.0.
  3284. + */
  3285. + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
  3286. +
  3287. + /**
  3288. + * The binary architecture version for which the function was compiled.
  3289. + * This value is the major binary version * 10 + the minor binary version,
  3290. + * so a binary version 1.3 function would return the value 13. Note that
  3291. + * this will return a value of 10 for legacy cubins that do not have a
  3292. + * properly-encoded binary architecture version.
  3293. + */
  3294. + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
  3295. +
  3296. + CU_FUNC_ATTRIBUTE_MAX /**< No explicit value; takes the next one, 7 (presumably the attribute count -- TODO confirm) */
  3297. +} CUfunction_attribute;
  3298. +
  3299. +/**
  3300. + * Function cache configurations
  3301. + */
  3302. +typedef enum CUfunc_cache_enum {
  3303. + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
  3304. + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
  3305. + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */
  3306. + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */
  3307. +} CUfunc_cache;
  3308. +
  3309. +/**
  3310. + * Shared memory configurations
  3311. + */
  3312. +typedef enum CUsharedconfig_enum {
  3313. + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */
  3314. + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */
  3315. + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */
  3316. +} CUsharedconfig;
  3317. +
  3318. +/**
  3319. + * Memory types
  3320. + */
  3321. +typedef enum CUmemorytype_enum {
  3322. + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
  3323. + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
  3324. + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */
  3325. + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
  3326. +} CUmemorytype;
  3327. +
  3328. +/**
  3329. + * Compute Modes
  3330. + */
  3331. +typedef enum CUcomputemode_enum {
  3332. + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */
  3333. + CU_COMPUTEMODE_EXCLUSIVE = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
  3334. + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
  3335. + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
  3336. +} CUcomputemode;
  3337. +
  3338. +/**
  3339. + * Online compiler options
  3340. + */
  3341. +typedef enum CUjit_option_enum
  3342. +{
  3343. + /**
  3344. + * Max number of registers that a thread may use.\n
  3345. + * Option type: unsigned int
  3346. + */
  3347. + CU_JIT_MAX_REGISTERS = 0,
  3348. +
  3349. + /**
  3350. + * IN: Specifies minimum number of threads per block to target compilation
  3351. + * for\n
  3352. + * OUT: Returns the number of threads the compiler actually targeted.
  3353. + * This restricts the resource utilization of the compiler (e.g. max
  3354. + * registers) such that a block with the given number of threads should be
  3355. + * able to launch based on register limitations. Note, this option does not
  3356. + * currently take into account any other resource limitations, such as
  3357. + * shared memory utilization.\n
  3358. + * Option type: unsigned int
  3359. + */
  3360. + CU_JIT_THREADS_PER_BLOCK,
  3361. +
  3362. + /**
  3363. + * Returns a float value in the option of the wall clock time, in
  3364. + * milliseconds, spent creating the cubin\n
  3365. + * Option type: float
  3366. + */
  3367. + CU_JIT_WALL_TIME,
  3368. +
  3369. + /**
  3370. + * Pointer to a buffer in which to print any log messages from PTXAS
  3371. + * that are informational in nature (the buffer size is specified via
  3372. + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
  3373. + * Option type: char*
  3374. + */
  3375. + CU_JIT_INFO_LOG_BUFFER,
  3376. +
  3377. + /**
  3378. + * IN: Log buffer size in bytes. Log messages will be capped at this size
  3379. + * (including null terminator)\n
  3380. + * OUT: Amount of log buffer filled with messages\n
  3381. + * Option type: unsigned int
  3382. + */
  3383. + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
  3384. +
  3385. + /**
  3386. + * Pointer to a buffer in which to print any log messages from PTXAS that
  3387. + * reflect errors (the buffer size is specified via option
  3388. + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
  3389. + * Option type: char*
  3390. + */
  3391. + CU_JIT_ERROR_LOG_BUFFER,
  3392. +
  3393. + /**
  3394. + * IN: Log buffer size in bytes. Log messages will be capped at this size
  3395. + * (including null terminator)\n
  3396. + * OUT: Amount of log buffer filled with messages\n
  3397. + * Option type: unsigned int
  3398. + */
  3399. + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
  3400. +
  3401. + /**
  3402. + * Level of optimizations to apply to generated code (0 - 4), with 4
  3403. + * being the default and highest level of optimizations.\n
  3404. + * Option type: unsigned int
  3405. + */
  3406. + CU_JIT_OPTIMIZATION_LEVEL,
  3407. +
  3408. + /**
  3409. + * No option value required. Determines the target based on the current
  3410. + * attached context (default)\n
  3411. + * Option type: No option value needed
  3412. + */
  3413. + CU_JIT_TARGET_FROM_CUCONTEXT,
  3414. +
  3415. + /**
  3416. + * Target is chosen based on supplied ::CUjit_target_enum.\n
  3417. + * Option type: unsigned int for enumerated type ::CUjit_target_enum
  3418. + */
  3419. + CU_JIT_TARGET,
  3420. +
  3421. + /**
  3422. + * Specifies choice of fallback strategy if matching cubin is not found.
  3423. + * Choice is based on supplied ::CUjit_fallback_enum.\n
  3424. + * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
  3425. + */
  3426. + CU_JIT_FALLBACK_STRATEGY
  3427. +
  3428. +} CUjit_option;
  3429. +
  3430. +/**
  3431. + * Online compilation targets
  3432. + */
  3433. +typedef enum CUjit_target_enum
  3434. +{
  3435. + CU_TARGET_COMPUTE_10 = 0, /**< Compute device class 1.0 */
  3436. + CU_TARGET_COMPUTE_11, /**< Compute device class 1.1 */
  3437. + CU_TARGET_COMPUTE_12, /**< Compute device class 1.2 */
  3438. + CU_TARGET_COMPUTE_13, /**< Compute device class 1.3 */
  3439. + CU_TARGET_COMPUTE_20, /**< Compute device class 2.0 */
  3440. + CU_TARGET_COMPUTE_21, /**< Compute device class 2.1 */
  3441. + CU_TARGET_COMPUTE_30 /**< Compute device class 3.0 */
  3442. +} CUjit_target;
  3443. +
  3444. +/**
  3445. + * Cubin matching fallback strategies
  3446. + */
  3447. +typedef enum CUjit_fallback_enum
  3448. +{
  3449. + CU_PREFER_PTX = 0, /**< Prefer to compile ptx */
  3450. +
  3451. + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */
  3452. +
  3453. +} CUjit_fallback;
  3454. +
  3455. +/**
  3456. + * Flags to register a graphics resource
  3457. + */
  3458. +typedef enum CUgraphicsRegisterFlags_enum {
  3459. + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
  3460. + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
  3461. + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
  3462. + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04,
  3463. + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
  3464. +} CUgraphicsRegisterFlags;
  3465. +
  3466. +/**
  3467. + * Flags for mapping and unmapping interop resources
  3468. + */
  3469. +typedef enum CUgraphicsMapResourceFlags_enum {
  3470. + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
  3471. + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
  3472. + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
  3473. +} CUgraphicsMapResourceFlags;
  3474. +
  3475. +/**
  3476. + * Array indices for cube faces
  3477. + */
  3478. +typedef enum CUarray_cubemap_face_enum {
  3479. + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
  3480. + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
  3481. + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
  3482. + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */
  3483. + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */
  3484. + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */
  3485. +} CUarray_cubemap_face;
  3486. +
  3487. +/**
  3488. + * Limits
  3489. + */
  3490. +typedef enum CUlimit_enum {
  3491. + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */
  3492. + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
  3493. + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */
  3494. +} CUlimit;
  3495. +
  3496. +/**
  3497. + * Error codes
  3498. + */
  3499. +typedef enum cudaError_enum {
  3500. + /**
  3501. + * The API call returned with no errors. In the case of query calls, this
  3502. + * can also mean that the operation being queried is complete (see
  3503. + * ::cuEventQuery() and ::cuStreamQuery()).
  3504. + */
  3505. + CUDA_SUCCESS = 0,
  3506. +
  3507. + /**
  3508. + * This indicates that one or more of the parameters passed to the API call
  3509. + * is not within an acceptable range of values.
  3510. + */
  3511. + CUDA_ERROR_INVALID_VALUE = 1,
  3512. +
  3513. + /**
  3514. + * The API call failed because it was unable to allocate enough memory to
  3515. + * perform the requested operation.
  3516. + */
  3517. + CUDA_ERROR_OUT_OF_MEMORY = 2,
  3518. +
  3519. + /**
  3520. + * This indicates that the CUDA driver has not been initialized with
  3521. + * ::cuInit() or that initialization has failed.
  3522. + */
  3523. + CUDA_ERROR_NOT_INITIALIZED = 3,
  3524. +
  3525. + /**
  3526. + * This indicates that the CUDA driver is in the process of shutting down.
  3527. + */
  3528. + CUDA_ERROR_DEINITIALIZED = 4,
  3529. +
  3530. + /**
  3531. + * This indicates profiling APIs are called while application is running
  3532. + * in visual profiler mode.
  3533. + */
  3534. + CUDA_ERROR_PROFILER_DISABLED = 5,
  3535. + /**
  3536. + * This indicates profiling has not been initialized for this context.
  3537. + * Call cuProfilerInitialize() to resolve this.
  3538. + */
  3539. + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,
  3540. + /**
  3541. + * This indicates profiler has already been started and probably
  3542. + * cuProfilerStart() is incorrectly called.
  3543. + */
  3544. + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
  3545. + /**
  3546. + * This indicates profiler has already been stopped and probably
  3547. + * cuProfilerStop() is incorrectly called.
  3548. + */
  3549. + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
  3550. + /**
  3551. + * This indicates that no CUDA-capable devices were detected by the installed
  3552. + * CUDA driver.
  3553. + */
  3554. + CUDA_ERROR_NO_DEVICE = 100,
  3555. +
  3556. + /**
  3557. + * This indicates that the device ordinal supplied by the user does not
  3558. + * correspond to a valid CUDA device.
  3559. + */
  3560. + CUDA_ERROR_INVALID_DEVICE = 101,
  3561. +
  3562. +
  3563. + /**
  3564. + * This indicates that the device kernel image is invalid. This can also
  3565. + * indicate an invalid CUDA module.
  3566. + */
  3567. + CUDA_ERROR_INVALID_IMAGE = 200,
  3568. +
  3569. + /**
  3570. + * This most frequently indicates that there is no context bound to the
  3571. + * current thread. This can also be returned if the context passed to an
  3572. + * API call is not a valid handle (such as a context that has had
  3573. + * ::cuCtxDestroy() invoked on it). This can also be returned if a user
  3574. + * mixes different API versions (i.e. 3010 context with 3020 API calls).
  3575. + * See ::cuCtxGetApiVersion() for more details.
  3576. + */
  3577. + CUDA_ERROR_INVALID_CONTEXT = 201,
  3578. +
  3579. + /**
  3580. + * This indicated that the context being supplied as a parameter to the
  3581. + * API call was already the active context.
  3582. + * \deprecated
  3583. + * This error return is deprecated as of CUDA 3.2. It is no longer an
  3584. + * error to attempt to push the active context via ::cuCtxPushCurrent().
  3585. + */
  3586. + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
  3587. +
  3588. + /**
  3589. + * This indicates that a map or register operation has failed.
  3590. + */
  3591. + CUDA_ERROR_MAP_FAILED = 205,
  3592. +
  3593. + /**
  3594. + * This indicates that an unmap or unregister operation has failed.
  3595. + */
  3596. + CUDA_ERROR_UNMAP_FAILED = 206,
  3597. +
  3598. + /**
  3599. + * This indicates that the specified array is currently mapped and thus
  3600. + * cannot be destroyed.
  3601. + */
  3602. + CUDA_ERROR_ARRAY_IS_MAPPED = 207,
  3603. +
  3604. + /**
  3605. + * This indicates that the resource is already mapped.
  3606. + */
  3607. + CUDA_ERROR_ALREADY_MAPPED = 208,
  3608. +
  3609. + /**
  3610. + * This indicates that there is no kernel image available that is suitable
  3611. + * for the device. This can occur when a user specifies code generation
  3612. + * options for a particular CUDA source file that do not include the
  3613. + * corresponding device configuration.
  3614. + */
  3615. + CUDA_ERROR_NO_BINARY_FOR_GPU = 209,
  3616. +
  3617. + /**
  3618. + * This indicates that a resource has already been acquired.
  3619. + */
  3620. + CUDA_ERROR_ALREADY_ACQUIRED = 210,
  3621. +
  3622. + /**
  3623. + * This indicates that a resource is not mapped.
  3624. + */
  3625. + CUDA_ERROR_NOT_MAPPED = 211,
  3626. +
  3627. + /**
  3628. + * This indicates that a mapped resource is not available for access as an
  3629. + * array.
  3630. + */
  3631. + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,
  3632. +
  3633. + /**
  3634. + * This indicates that a mapped resource is not available for access as a
  3635. + * pointer.
  3636. + */
  3637. + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,
  3638. +
  3639. + /**
  3640. + * This indicates that an uncorrectable ECC error was detected during
  3641. + * execution.
  3642. + */
  3643. + CUDA_ERROR_ECC_UNCORRECTABLE = 214,
  3644. +
  3645. + /**
  3646. + * This indicates that the ::CUlimit passed to the API call is not
  3647. + * supported by the active device.
  3648. + */
  3649. + CUDA_ERROR_UNSUPPORTED_LIMIT = 215,
  3650. +
  3651. + /**
  3652. + * This indicates that the ::CUcontext passed to the API call can
  3653. + * only be bound to a single CPU thread at a time but is already
  3654. + * bound to a CPU thread.
  3655. + */
  3656. + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,
  3657. +
  3658. + /**
  3659. + * This indicates that the device kernel source is invalid.
  3660. + */
  3661. + CUDA_ERROR_INVALID_SOURCE = 300,
  3662. +
  3663. + /**
  3664. + * This indicates that the file specified was not found.
  3665. + */
  3666. + CUDA_ERROR_FILE_NOT_FOUND = 301,
  3667. +
  3668. + /**
  3669. + * This indicates that a link to a shared object failed to resolve.
  3670. + */
  3671. + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
  3672. +
  3673. + /**
  3674. + * This indicates that initialization of a shared object failed.
  3675. + */
  3676. + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,
  3677. +
  3678. + /**
  3679. + * This indicates that an OS call failed.
  3680. + */
  3681. + CUDA_ERROR_OPERATING_SYSTEM = 304,
  3682. +
  3683. +
  3684. + /**
  3685. + * This indicates that a resource handle passed to the API call was not
  3686. + * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
  3687. + */
  3688. + CUDA_ERROR_INVALID_HANDLE = 400,
  3689. +
  3690. +
  3691. + /**
  3692. + * This indicates that a named symbol was not found. Examples of symbols
  3693. + * are global/constant variable names, texture names, and surface names.
  3694. + */
  3695. + CUDA_ERROR_NOT_FOUND = 500,
  3696. +
  3697. +
  3698. + /**
  3699. + * This indicates that asynchronous operations issued previously have not
  3700. + * completed yet. This result is not actually an error, but must be indicated
  3701. + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
  3702. + * may return this value include ::cuEventQuery() and ::cuStreamQuery().
  3703. + */
  3704. + CUDA_ERROR_NOT_READY = 600,
  3705. +
  3706. +
  3707. + /**
  3708. + * An exception occurred on the device while executing a kernel. Common
  3709. + * causes include dereferencing an invalid device pointer and accessing
  3710. + * out of bounds shared memory. The context cannot be used, so it must
  3711. + * be destroyed (and a new one should be created). All existing device
  3712. + * memory allocations from this context are invalid and must be
  3713. + * reconstructed if the program is to continue using CUDA.
  3714. + */
  3715. + CUDA_ERROR_LAUNCH_FAILED = 700,
  3716. +
  3717. + /**
  3718. + * This indicates that a launch did not occur because it did not have
  3719. + * appropriate resources. This error usually indicates that the user has
  3720. + * attempted to pass too many arguments to the device kernel, or the
  3721. + * kernel launch specifies too many threads for the kernel's register
  3722. + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
  3723. + * when a 32-bit int is expected) is equivalent to passing too many
  3724. + * arguments and can also result in this error.
  3725. + */
  3726. + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
  3727. +
  3728. + /**
  3729. + * This indicates that the device kernel took too long to execute. This can
  3730. + * only occur if timeouts are enabled - see the device attribute
  3731. + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
  3732. + * context cannot be used (and must be destroyed similar to
  3733. + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
  3734. + * this context are invalid and must be reconstructed if the program is to
  3735. + * continue using CUDA.
  3736. + */
  3737. + CUDA_ERROR_LAUNCH_TIMEOUT = 702,
  3738. +
  3739. + /**
  3740. + * This error indicates a kernel launch that uses an incompatible texturing
  3741. + * mode.
  3742. + */
  3743. + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
  3744. +
  3745. + /**
  3746. + * This error indicates that a call to ::cuCtxEnablePeerAccess() is
  3747. + * trying to re-enable peer access to a context which has already
  3748. + * had peer access to it enabled.
  3749. + */
  3750. + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
  3751. +
  3752. + /**
  3753. + * This error indicates that ::cuCtxDisablePeerAccess() is
  3754. + * trying to disable peer access which has not been enabled yet
  3755. + * via ::cuCtxEnablePeerAccess().
  3756. + */
  3757. + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,
  3758. +
  3759. + /**
  3760. + * This error indicates that the primary context for the specified device
  3761. + * has already been initialized.
  3762. + */
  3763. + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,
  3764. +
  3765. + /**
  3766. + * This error indicates that the context current to the calling thread
  3767. + * has been destroyed using ::cuCtxDestroy, or is a primary context which
  3768. + * has not yet been initialized.
  3769. + */
  3770. + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,
  3771. +
  3772. + /**
  3773. + * A device-side assert triggered during kernel execution. The context
  3774. + * cannot be used anymore, and must be destroyed. All existing device
  3775. + * memory allocations from this context are invalid and must be
  3776. + * reconstructed if the program is to continue using CUDA.
  3777. + */
  3778. + CUDA_ERROR_ASSERT = 710,
  3779. +
  3780. + /**
  3781. + * This error indicates that the hardware resources required to enable
  3782. + * peer access have been exhausted for one or more of the devices
  3783. + * passed to ::cuCtxEnablePeerAccess().
  3784. + */
  3785. + CUDA_ERROR_TOO_MANY_PEERS = 711,
  3786. +
  3787. + /**
  3788. + * This error indicates that the memory range passed to ::cuMemHostRegister()
  3789. + * has already been registered.
  3790. + */
  3791. + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
  3792. +
  3793. + /**
  3794. + * This error indicates that the pointer passed to ::cuMemHostUnregister()
  3795. + * does not correspond to any currently registered memory region.
  3796. + */
  3797. + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713,
  3798. +
  3799. + /**
  3800. + * This indicates that an unknown internal error has occurred.
  3801. + */
  3802. + CUDA_ERROR_UNKNOWN = 999
  3803. +} CUresult;
  3804. +
  3805. +/**
  3806. + * If set, host memory is portable between CUDA contexts.
  3807. + * Flag for ::cuMemHostAlloc()
  3808. + */
  3809. +#define CU_MEMHOSTALLOC_PORTABLE 0x01
  3810. +
  3811. +/**
  3812. + * If set, host memory is mapped into CUDA address space and
  3813. + * ::cuMemHostGetDevicePointer() may be called on the host pointer.
  3814. + * Flag for ::cuMemHostAlloc()
  3815. + */
  3816. +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
  3817. +
  3818. +/**
  3819. + * If set, host memory is allocated as write-combined - fast to write,
  3820. + * faster to DMA, slow to read except via SSE4 streaming load instruction
  3821. + * (MOVNTDQA).
  3822. + * Flag for ::cuMemHostAlloc()
  3823. + */
  3824. +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
  3825. +
  3826. +/**
  3827. + * If set, host memory is portable between CUDA contexts.
  3828. + * Flag for ::cuMemHostRegister()
  3829. + */
  3830. +#define CU_MEMHOSTREGISTER_PORTABLE 0x01
  3831. +
  3832. +/**
  3833. + * If set, host memory is mapped into CUDA address space and
  3834. + * ::cuMemHostGetDevicePointer() may be called on the host pointer.
  3835. + * Flag for ::cuMemHostRegister()
  3836. + */
  3837. +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
  3838. +
  3839. +#if __CUDA_API_VERSION >= 3020
  3840. +
  3841. +/**
  3842. + * 2D memory copy parameters
  3843. + */
  3844. +typedef struct CUDA_MEMCPY2D_st {
  3845. + size_t srcXInBytes; /**< Source X in bytes */
  3846. + size_t srcY; /**< Source Y */
  3847. +
  3848. + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
  3849. + const void *srcHost; /**< Source host pointer */
  3850. + CUdeviceptr srcDevice; /**< Source device pointer */
  3851. + CUarray srcArray; /**< Source array reference */
  3852. + size_t srcPitch; /**< Source pitch (ignored when src is array) */
  3853. +
  3854. + size_t dstXInBytes; /**< Destination X in bytes */
  3855. + size_t dstY; /**< Destination Y */
  3856. +
  3857. + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
  3858. + void *dstHost; /**< Destination host pointer */
  3859. + CUdeviceptr dstDevice; /**< Destination device pointer */
  3860. + CUarray dstArray; /**< Destination array reference */
  3861. + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
  3862. +
  3863. + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
  3864. + size_t Height; /**< Height of 2D memory copy */
  3865. +} CUDA_MEMCPY2D;
  3866. +
  3867. +/**
  3868. + * 3D memory copy parameters
  3869. + */
  3870. +typedef struct CUDA_MEMCPY3D_st {
  3871. + size_t srcXInBytes; /**< Source X in bytes */
  3872. + size_t srcY; /**< Source Y */
  3873. + size_t srcZ; /**< Source Z */
  3874. + size_t srcLOD; /**< Source LOD */
  3875. + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
  3876. + const void *srcHost; /**< Source host pointer */
  3877. + CUdeviceptr srcDevice; /**< Source device pointer */
  3878. + CUarray srcArray; /**< Source array reference */
  3879. + void *reserved0; /**< Must be NULL */
  3880. + size_t srcPitch; /**< Source pitch (ignored when src is array) */
  3881. + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
  3882. +
  3883. + size_t dstXInBytes; /**< Destination X in bytes */
  3884. + size_t dstY; /**< Destination Y */
  3885. + size_t dstZ; /**< Destination Z */
  3886. + size_t dstLOD; /**< Destination LOD */
  3887. + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
  3888. + void *dstHost; /**< Destination host pointer */
  3889. + CUdeviceptr dstDevice; /**< Destination device pointer */
  3890. + CUarray dstArray; /**< Destination array reference */
  3891. + void *reserved1; /**< Must be NULL */
  3892. + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
  3893. + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
  3894. +
  3895. + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
  3896. + size_t Height; /**< Height of 3D memory copy */
  3897. + size_t Depth; /**< Depth of 3D memory copy */
  3898. +} CUDA_MEMCPY3D;
  3899. +
  3900. +/**
  3901. + * 3D memory cross-context copy parameters
  3902. + */
  3903. +typedef struct CUDA_MEMCPY3D_PEER_st {
  3904. + size_t srcXInBytes; /**< Source X in bytes */
  3905. + size_t srcY; /**< Source Y */
  3906. + size_t srcZ; /**< Source Z */
  3907. + size_t srcLOD; /**< Source LOD */
  3908. + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
  3909. + const void *srcHost; /**< Source host pointer */
  3910. + CUdeviceptr srcDevice; /**< Source device pointer */
  3911. + CUarray srcArray; /**< Source array reference */
  3912. + CUcontext srcContext; /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
  3913. + size_t srcPitch; /**< Source pitch (ignored when src is array) */
  3914. + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
  3915. +
  3916. + size_t dstXInBytes; /**< Destination X in bytes */
  3917. + size_t dstY; /**< Destination Y */
  3918. + size_t dstZ; /**< Destination Z */
  3919. + size_t dstLOD; /**< Destination LOD */
  3920. + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
  3921. + void *dstHost; /**< Destination host pointer */
  3922. + CUdeviceptr dstDevice; /**< Destination device pointer */
  3923. + CUarray dstArray; /**< Destination array reference */
  3924. + CUcontext dstContext; /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
  3925. + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
  3926. + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
  3927. +
  3928. + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
  3929. + size_t Height; /**< Height of 3D memory copy */
  3930. + size_t Depth; /**< Depth of 3D memory copy */
  3931. +} CUDA_MEMCPY3D_PEER;
  3932. +
  3933. +/**
  3934. + * Array descriptor
  3935. + */
  3936. +typedef struct CUDA_ARRAY_DESCRIPTOR_st
  3937. +{
  3938. + size_t Width; /**< Width of array */
  3939. + size_t Height; /**< Height of array */
  3940. +
  3941. + CUarray_format Format; /**< Array format */
  3942. + unsigned int NumChannels; /**< Channels per array element */
  3943. +} CUDA_ARRAY_DESCRIPTOR;
  3944. +
  3945. +/**
  3946. + * 3D array descriptor
  3947. + */
  3948. +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
  3949. +{
  3950. + size_t Width; /**< Width of 3D array */
  3951. + size_t Height; /**< Height of 3D array */
  3952. + size_t Depth; /**< Depth of 3D array (number of layers when ::CUDA_ARRAY3D_LAYERED is set) */
  3953. +
  3954. + CUarray_format Format; /**< Array format */
  3955. + unsigned int NumChannels; /**< Channels per array element */
  3956. + unsigned int Flags; /**< Flags (CUDA_ARRAY3D_* bit values, e.g. ::CUDA_ARRAY3D_LAYERED, ::CUDA_ARRAY3D_CUBEMAP) */
  3957. +} CUDA_ARRAY3D_DESCRIPTOR;
  3958. +
  3959. +#endif /* __CUDA_API_VERSION >= 3020 */
  3960. +
  3961. +/**
  3962. + * If set, the CUDA array is a collection of layers, where each layer is either a 1D
  3963. + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
  3964. + * of layers, not the depth of a 3D array.
  3965. + */
  3966. +#define CUDA_ARRAY3D_LAYERED 0x01
  3967. +
  3968. +/**
  3969. + * Deprecated, use CUDA_ARRAY3D_LAYERED
  3970. + */
  3971. +#define CUDA_ARRAY3D_2DARRAY 0x01
  3972. +
  3973. +/**
  3974. + * This flag must be set in order to bind a surface reference
  3975. + * to the CUDA array
  3976. + */
  3977. +#define CUDA_ARRAY3D_SURFACE_LDST 0x02
  3978. +
  3979. +/**
  3980. + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
  3981. + * width of such a CUDA array must be equal to its height, and Depth must be six.
  3982. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
  3983. + * and Depth must be a multiple of six.
  3984. + */
  3985. +#define CUDA_ARRAY3D_CUBEMAP 0x04
  3986. +
  3987. +/**
  3988. + * This flag must be set in order to perform texture gather operations
  3989. + * on a CUDA array.
  3990. + */
  3991. +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
  3992. +
  3993. +/**
  3994. + * Override the texref format with a format inferred from the array.
  3995. + * Flag for ::cuTexRefSetArray()
  3996. + */
  3997. +#define CU_TRSA_OVERRIDE_FORMAT 0x01
  3998. +
  3999. +/**
  4000. + * Read the texture as integers rather than promoting the values to floats
  4001. + * in the range [0,1].
  4002. + * Flag for ::cuTexRefSetFlags()
  4003. + */
  4004. +#define CU_TRSF_READ_AS_INTEGER 0x01
  4005. +
  4006. +/**
  4007. + * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
  4008. + * Flag for ::cuTexRefSetFlags()
  4009. + */
  4010. +#define CU_TRSF_NORMALIZED_COORDINATES 0x02
  4011. +
  4012. +/**
  4013. + * Perform sRGB->linear conversion during texture read.
  4014. + * Flag for ::cuTexRefSetFlags()
  4015. + */
  4016. +#define CU_TRSF_SRGB 0x10
  4017. +
  4018. +/**
  4019. + * End of array terminator for the \p extra parameter to
  4020. + * ::cuLaunchKernel
  4021. + */
  4022. +#define CU_LAUNCH_PARAM_END ((void*)0x00)
  4023. +
  4024. +/**
  4025. + * Indicator that the next value in the \p extra parameter to
  4026. + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
  4027. + * parameters used for launching kernel \p f. This buffer needs to
  4028. + * honor all alignment/padding requirements of the individual parameters.
  4029. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
  4030. + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
  4031. + * effect.
  4032. + */
  4033. +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
  4034. +
  4035. +/**
  4036. + * Indicator that the next value in the \p extra parameter to
  4037. + * ::cuLaunchKernel will be a pointer to a size_t which contains the
  4038. + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
  4039. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
  4040. + * in the \p extra array if the value associated with
  4041. + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
  4042. + */
  4043. +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
  4044. +
  4045. +/**
  4046. + * For texture references loaded into the module, use default texunit from
  4047. + * texture reference.
  4048. + */
  4049. +#define CU_PARAM_TR_DEFAULT -1
  4050. +
  4051. +/** @} */ /* END CUDA_TYPES */
  4052. +
  4053. +#ifdef _WIN32
  4054. +#define CUDAAPI __stdcall
  4055. +#else
  4056. +#define CUDAAPI
  4057. +#endif
  4058. +
  4059. +/**
  4060. + * \defgroup CUDA_INITIALIZE Initialization
  4061. + *
  4062. + * This section describes the initialization functions of the low-level CUDA
  4063. + * driver application programming interface.
  4064. + *
  4065. + * @{
  4066. + */
  4067. +
  4068. +/**
  4069. + * \brief Initialize the CUDA driver API
  4070. + *
  4071. + * Initializes the driver API and must be called before any other function from
  4072. + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit()
  4073. + * has not been called, any function from the driver API will return
  4074. + * ::CUDA_ERROR_NOT_INITIALIZED.
  4075. + *
  4076. + * \param Flags - Initialization flag for CUDA.
  4077. + *
  4078. + * \return
  4079. + * ::CUDA_SUCCESS,
  4080. + * ::CUDA_ERROR_INVALID_VALUE,
  4081. + * ::CUDA_ERROR_INVALID_DEVICE
  4082. + * \notefnerr
  4083. + */
  4084. +CUresult CUDAAPI cuInit(unsigned int Flags);
  4085. +
  4086. +/** @} */ /* END CUDA_INITIALIZE */
  4087. +
  4088. +/**
  4089. + * \defgroup CUDA_VERSION Version Management
  4090. + *
  4091. + * This section describes the version management functions of the low-level
  4092. + * CUDA driver application programming interface.
  4093. + *
  4094. + * @{
  4095. + */
  4096. +
  4097. +/**
  4098. + * \brief Returns the CUDA driver version
  4099. + *
  4100. + * Returns in \p *driverVersion the version number of the installed CUDA
  4101. + * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
  4102. + * the \p driverVersion argument is NULL.
  4103. + *
  4104. + * \param driverVersion - Returns the CUDA driver version
  4105. + *
  4106. + * \return
  4107. + * ::CUDA_SUCCESS,
  4108. + * ::CUDA_ERROR_INVALID_VALUE
  4109. + * \notefnerr
  4110. + */
  4111. +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
  4112. +
  4113. +/** @} */ /* END CUDA_VERSION */
  4114. +
  4115. +/**
  4116. + * \defgroup CUDA_DEVICE Device Management
  4117. + *
  4118. + * This section describes the device management functions of the low-level
  4119. + * CUDA driver application programming interface.
  4120. + *
  4121. + * @{
  4122. + */
  4123. +
  4124. +/**
  4125. + * \brief Returns a handle to a compute device
  4126. + *
  4127. + * Returns in \p *device a device handle given an ordinal in the range <b>[0,
  4128. + * ::cuDeviceGetCount()-1]</b>.
  4129. + *
  4130. + * \param device - Returned device handle
  4131. + * \param ordinal - Device number to get handle for
  4132. + *
  4133. + * \return
  4134. + * ::CUDA_SUCCESS,
  4135. + * ::CUDA_ERROR_DEINITIALIZED,
  4136. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4137. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4138. + * ::CUDA_ERROR_INVALID_VALUE,
  4139. + * ::CUDA_ERROR_INVALID_DEVICE
  4140. + * \notefnerr
  4141. + *
  4142. + * \sa ::cuDeviceComputeCapability,
  4143. + * ::cuDeviceGetAttribute,
  4144. + * ::cuDeviceGetCount,
  4145. + * ::cuDeviceGetName,
  4146. + * ::cuDeviceGetProperties,
  4147. + * ::cuDeviceTotalMem
  4148. + */
  4149. +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
  4150. +
  4151. +/**
  4152. + * \brief Returns the number of compute-capable devices
  4153. + *
  4154. + * Returns in \p *count the number of devices with compute capability greater
  4155. + * than or equal to 1.0 that are available for execution. If there is no such
  4156. + * device, ::cuDeviceGetCount() returns 0.
  4157. + *
  4158. + * \param count - Returned number of compute-capable devices
  4159. + *
  4160. + * \return
  4161. + * ::CUDA_SUCCESS,
  4162. + * ::CUDA_ERROR_DEINITIALIZED,
  4163. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4164. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4165. + * ::CUDA_ERROR_INVALID_VALUE
  4166. + * \notefnerr
  4167. + *
  4168. + * \sa ::cuDeviceComputeCapability,
  4169. + * ::cuDeviceGetAttribute,
  4170. + * ::cuDeviceGetName,
  4171. + * ::cuDeviceGet,
  4172. + * ::cuDeviceGetProperties,
  4173. + * ::cuDeviceTotalMem
  4174. + */
  4175. +CUresult CUDAAPI cuDeviceGetCount(int *count);
  4176. +
  4177. +/**
  4178. + * \brief Returns an identifier string for the device
  4179. + *
  4180. + * Returns an ASCII string identifying the device \p dev in the NULL-terminated
  4181. + * string pointed to by \p name. \p len specifies the maximum length of the
  4182. + * string that may be returned.
  4183. + *
  4184. + * \param name - Returned identifier string for the device
  4185. + * \param len - Maximum length of string to store in \p name
  4186. + * \param dev - Device to get identifier string for
  4187. + *
  4188. + * \return
  4189. + * ::CUDA_SUCCESS,
  4190. + * ::CUDA_ERROR_DEINITIALIZED,
  4191. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4192. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4193. + * ::CUDA_ERROR_INVALID_VALUE,
  4194. + * ::CUDA_ERROR_INVALID_DEVICE
  4195. + * \notefnerr
  4196. + *
  4197. + * \sa ::cuDeviceComputeCapability,
  4198. + * ::cuDeviceGetAttribute,
  4199. + * ::cuDeviceGetCount,
  4200. + * ::cuDeviceGet,
  4201. + * ::cuDeviceGetProperties,
  4202. + * ::cuDeviceTotalMem
  4203. + */
  4204. +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
  4205. +
  4206. +/**
  4207. + * \brief Returns the compute capability of the device
  4208. + *
  4209. + * Returns in \p *major and \p *minor the major and minor revision numbers that
  4210. + * define the compute capability of the device \p dev.
  4211. + *
  4212. + * \param major - Major revision number
  4213. + * \param minor - Minor revision number
  4214. + * \param dev - Device handle
  4215. + *
  4216. + * \return
  4217. + * ::CUDA_SUCCESS,
  4218. + * ::CUDA_ERROR_DEINITIALIZED,
  4219. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4220. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4221. + * ::CUDA_ERROR_INVALID_VALUE,
  4222. + * ::CUDA_ERROR_INVALID_DEVICE
  4223. + * \notefnerr
  4224. + *
  4225. + * \sa
  4226. + * ::cuDeviceGetAttribute,
  4227. + * ::cuDeviceGetCount,
  4228. + * ::cuDeviceGetName,
  4229. + * ::cuDeviceGet,
  4230. + * ::cuDeviceGetProperties,
  4231. + * ::cuDeviceTotalMem
  4232. + */
  4233. +CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
  4234. +
  4235. +#if __CUDA_API_VERSION >= 3020
  4236. +/**
  4237. + * \brief Returns the total amount of memory on the device
  4238. + *
  4239. + * Returns in \p *bytes the total amount of memory available on the device
  4240. + * \p dev in bytes.
  4241. + *
  4242. + * \param bytes - Returned memory available on device in bytes
  4243. + * \param dev - Device handle
  4244. + *
  4245. + * \return
  4246. + * ::CUDA_SUCCESS,
  4247. + * ::CUDA_ERROR_DEINITIALIZED,
  4248. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4249. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4250. + * ::CUDA_ERROR_INVALID_VALUE,
  4251. + * ::CUDA_ERROR_INVALID_DEVICE
  4252. + * \notefnerr
  4253. + *
  4254. + * \sa ::cuDeviceComputeCapability,
  4255. + * ::cuDeviceGetAttribute,
  4256. + * ::cuDeviceGetCount,
  4257. + * ::cuDeviceGetName,
  4258. + * ::cuDeviceGet,
  4259. + * ::cuDeviceGetProperties
  4260. + */
  4261. +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
  4262. +#endif /* __CUDA_API_VERSION >= 3020 */
  4263. +
  4264. +/**
  4265. + * \brief Returns properties for a selected device
  4266. + *
  4267. + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
  4268. + * structure is defined as:
  4269. + *
  4270. + * \code
  4271. + typedef struct CUdevprop_st {
  4272. + int maxThreadsPerBlock;
  4273. + int maxThreadsDim[3];
  4274. + int maxGridSize[3];
  4275. + int sharedMemPerBlock;
  4276. + int totalConstantMemory;
  4277. + int SIMDWidth;
  4278. + int memPitch;
  4279. + int regsPerBlock;
  4280. + int clockRate;
  4281. + int textureAlign;
  4282. + } CUdevprop;
  4283. + * \endcode
  4284. + * where:
  4285. + *
  4286. + * - ::maxThreadsPerBlock is the maximum number of threads per block;
  4287. + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
  4288. + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
  4289. + * - ::sharedMemPerBlock is the total amount of shared memory available per
  4290. + * block in bytes;
  4291. + * - ::totalConstantMemory is the total amount of constant memory available on
  4292. + * the device in bytes;
  4293. + * - ::SIMDWidth is the warp size;
  4294. + * - ::memPitch is the maximum pitch allowed by the memory copy functions that
  4295. + * involve memory regions allocated through ::cuMemAllocPitch();
  4296. + * - ::regsPerBlock is the total number of registers available per block;
  4297. + * - ::clockRate is the clock frequency in kilohertz;
  4298. + * - ::textureAlign is the alignment requirement; texture base addresses that
  4299. + * are aligned to ::textureAlign bytes do not need an offset applied to
  4300. + * texture fetches.
  4301. + *
  4302. + * \param prop - Returned properties of device
  4303. + * \param dev - Device to get properties for
  4304. + *
  4305. + * \return
  4306. + * ::CUDA_SUCCESS,
  4307. + * ::CUDA_ERROR_DEINITIALIZED,
  4308. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4309. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4310. + * ::CUDA_ERROR_INVALID_VALUE,
  4311. + * ::CUDA_ERROR_INVALID_DEVICE
  4312. + * \notefnerr
  4313. + *
  4314. + * \sa ::cuDeviceComputeCapability,
  4315. + * ::cuDeviceGetAttribute,
  4316. + * ::cuDeviceGetCount,
  4317. + * ::cuDeviceGetName,
  4318. + * ::cuDeviceGet,
  4319. + * ::cuDeviceTotalMem
  4320. + */
  4321. +CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
  4322. +
  4323. +/**
  4324. + * \brief Returns information about the device
  4325. + *
  4326. + * Returns in \p *pi the integer value of the attribute \p attrib on device
  4327. + * \p dev. The supported attributes are:
  4328. + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
  4329. + * block;
  4330. + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block;
  4331. + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block;
  4332. + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block;
  4333. + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid;
  4334. + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid;
  4335. + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid;
  4336. + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
  4337. + * shared memory available to a thread block in bytes; this amount is shared
  4338. + * by all thread blocks simultaneously resident on a multiprocessor;
  4339. + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
  4340. + * __constant__ variables in a CUDA C kernel in bytes;
  4341. + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads;
  4342. + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
  4343. + * memory copy functions that involve memory regions allocated through
  4344. + * ::cuMemAllocPitch();
  4345. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
  4346. + * texture width;
  4347. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
  4348. + * for a 1D texture bound to linear memory;
  4349. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
  4350. + * texture width;
  4351. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
  4352. + * texture height;
  4353. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
  4354. + * for a 2D texture bound to linear memory;
  4355. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
  4356. + * for a 2D texture bound to linear memory;
  4357. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
  4358. + * in bytes for a 2D texture bound to linear memory;
  4359. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
  4360. + * texture width;
  4361. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
  4362. + * texture height;
  4363. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
  4364. + * texture depth;
  4365. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
  4366. + * Alternate maximum 3D texture width, 0 if no alternate
  4367. + * maximum 3D texture size is supported;
  4368. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
  4369. + * Alternate maximum 3D texture height, 0 if no alternate
  4370. + * maximum 3D texture size is supported;
  4371. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
  4372. + * Alternate maximum 3D texture depth, 0 if no alternate
  4373. + * maximum 3D texture size is supported;
  4374. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
  4375. + * Maximum cubemap texture width or height;
  4376. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
  4377. + * Maximum 1D layered texture width;
  4378. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
  4379. + * Maximum layers in a 1D layered texture;
  4380. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
  4381. + * Maximum 2D layered texture width;
  4382. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
  4383. + * Maximum 2D layered texture height;
  4384. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
  4385. + * Maximum layers in a 2D layered texture;
  4386. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
  4387. + * Maximum cubemap layered texture width or height;
  4388. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
  4389. + * Maximum layers in a cubemap layered texture;
  4390. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
  4391. + * Maximum 1D surface width;
  4392. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
  4393. + * Maximum 2D surface width;
  4394. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
  4395. + * Maximum 2D surface height;
  4396. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
  4397. + * Maximum 3D surface width;
  4398. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
  4399. + * Maximum 3D surface height;
  4400. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
  4401. + * Maximum 3D surface depth;
  4402. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
  4403. + * Maximum 1D layered surface width;
  4404. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
  4405. + * Maximum layers in a 1D layered surface;
  4406. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
  4407. + * Maximum 2D layered surface width;
  4408. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
  4409. + * Maximum 2D layered surface height;
  4410. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
  4411. + * Maximum layers in a 2D layered surface;
  4412. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
  4413. + * Maximum cubemap surface width;
  4414. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
  4415. + * Maximum cubemap layered surface width;
  4416. + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
  4417. + * Maximum layers in a cubemap layered surface;
  4418. + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
  4419. + * registers available to a thread block; this number is shared by all thread
  4420. + * blocks simultaneously resident on a multiprocessor;
  4421. + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz;
  4422. + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
  4423. + * base addresses aligned to ::textureAlign bytes do not need an offset
  4424. + * applied to texture fetches;
  4425. + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
  4426. + * for 2D texture references bound to pitched memory;
  4427. + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
  4428. + * memory between host and device while executing a kernel, or 0 if not;
  4429. + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
  4430. + * the device;
  4431. + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
  4432. + * for kernels executed on the device, or 0 if not;
  4433. + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
  4434. + * memory subsystem, or 0 if not;
  4435. + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
  4436. + * memory into the CUDA address space, or 0 if not;
  4437. + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
  4438. + * in. Available modes are as follows:
  4439. + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
  4440. + * can have multiple CUDA contexts present at a single time.
  4441. + * - ::CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have
  4442. + * only one CUDA context present on it at a time.
  4443. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
  4444. + * prohibited from creating new CUDA contexts.
  4445. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device
  4446. + * can have only one context used by a single process at a time.
  4447. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
  4448. + * executing multiple kernels within the same context simultaneously, or 0 if
  4449. + * not. It is not guaranteed that multiple kernels will be resident
  4450. + * on the device concurrently so this feature should not be relied upon for
  4451. + * correctness;
  4452. + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
  4453. + * device, 0 if error correction is disabled or not supported by the device;
  4454. + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
  4455. + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
  4456. + * of the device;
  4457. + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
  4458. + * is only available on Tesla hardware running Windows Vista or later;
  4459. + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
  4460. + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
  4461. + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
  4462. + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
  4463. + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
  4464. + * the host, or 0 if not;
  4465. + *
  4466. + * \param pi - Returned device attribute value
  4467. + * \param attrib - Device attribute to query
  4468. + * \param dev - Device handle
  4469. + *
  4470. + * \return
  4471. + * ::CUDA_SUCCESS,
  4472. + * ::CUDA_ERROR_DEINITIALIZED,
  4473. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4474. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4475. + * ::CUDA_ERROR_INVALID_VALUE,
  4476. + * ::CUDA_ERROR_INVALID_DEVICE
  4477. + * \notefnerr
  4478. + *
  4479. + * \sa ::cuDeviceComputeCapability,
  4480. + * ::cuDeviceGetCount,
  4481. + * ::cuDeviceGetName,
  4482. + * ::cuDeviceGet,
  4483. + * ::cuDeviceGetProperties,
  4484. + * ::cuDeviceTotalMem
  4485. + */
  4486. +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
  4487. +
  4488. +/** @} */ /* END CUDA_DEVICE */
  4489. +
  4490. +
  4491. +/**
  4492. + * \defgroup CUDA_CTX Context Management
  4493. + *
  4494. + * This section describes the context management functions of the low-level
  4495. + * CUDA driver application programming interface.
  4496. + *
  4497. + * @{
  4498. + */
  4499. +
  4500. +#if __CUDA_API_VERSION >= 3020
  4501. +/**
  4502. + * \brief Create a CUDA context
  4503. + *
  4504. + * Creates a new CUDA context and associates it with the calling thread. The
  4505. + * \p flags parameter is described below. The context is created with a usage
  4506. + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
  4507. + * when done using the context. If a context is already current to the thread,
  4508. + * it is supplanted by the newly created context and may be restored by a subsequent
  4509. + * call to ::cuCtxPopCurrent().
  4510. + *
  4511. + * The three LSBs of the \p flags parameter can be used to control how the OS
  4512. + * thread, which owns the CUDA context at the time of an API call, interacts
  4513. + * with the OS scheduler when waiting for results from the GPU. Only one of
  4514. + * the scheduling flags can be set when creating a context.
  4515. + *
  4516. + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
  4517. + * uses a heuristic based on the number of active CUDA contexts in the
  4518. + * process \e C and the number of logical processors in the system \e P. If
  4519. + * \e C > \e P, then CUDA will yield to other OS threads when waiting for
  4520. + * the GPU, otherwise CUDA will not yield while waiting for results and
  4521. + * actively spin on the processor.
  4522. + *
  4523. + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
  4524. + * results from the GPU. This can decrease latency when waiting for the GPU,
  4525. + * but may lower the performance of CPU threads if they are performing work in
  4526. + * parallel with the CUDA thread.
  4527. + *
  4528. + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
  4529. + * results from the GPU. This can increase latency when waiting for the GPU,
  4530. + * but can increase the performance of CPU threads performing work in parallel
  4531. + * with the GPU.
  4532. + *
  4533. + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
  4534. + * synchronization primitive when waiting for the GPU to finish work.
  4535. + *
  4536. + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
  4537. + * synchronization primitive when waiting for the GPU to finish work. <br>
  4538. + * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
  4539. + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
  4540. + *
  4541. + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
  4542. + * This flag must be set in order to allocate pinned host memory that is
  4543. + * accessible to the GPU.
  4544. + *
  4545. + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
  4546. + * after resizing local memory for a kernel. This can prevent thrashing by
  4547. + * local memory allocations when launching many kernels with high local
  4548. + * memory usage at the cost of potentially increased memory usage.
  4549. + *
  4550. + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
  4551. + * the device is ::CU_COMPUTEMODE_PROHIBITED. Similarly, context creation will
  4552. + * also fail with ::CUDA_ERROR_UNKNOWN if the compute mode for the device is
  4553. + * set to ::CU_COMPUTEMODE_EXCLUSIVE and there is already an active context on
  4554. + * the device. The function ::cuDeviceGetAttribute() can be used with
  4555. + * ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the
  4556. + * device. The <i>nvidia-smi</i> tool can be used to set the compute mode for
  4557. + * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
  4558. + * -h option to it.
  4559. + *
  4560. + * \param pctx - Returned context handle of the new context
  4561. + * \param flags - Context creation flags
  4562. + * \param dev - Device to create context on
  4563. + *
  4564. + * \return
  4565. + * ::CUDA_SUCCESS,
  4566. + * ::CUDA_ERROR_DEINITIALIZED,
  4567. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4568. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4569. + * ::CUDA_ERROR_INVALID_DEVICE,
  4570. + * ::CUDA_ERROR_INVALID_VALUE,
  4571. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  4572. + * ::CUDA_ERROR_UNKNOWN
  4573. + * \notefnerr
  4574. + *
  4575. + * \sa ::cuCtxDestroy,
  4576. + * ::cuCtxGetApiVersion,
  4577. + * ::cuCtxGetCacheConfig,
  4578. + * ::cuCtxGetDevice,
  4579. + * ::cuCtxGetLimit,
  4580. + * ::cuCtxPopCurrent,
  4581. + * ::cuCtxPushCurrent,
  4582. + * ::cuCtxSetCacheConfig,
  4583. + * ::cuCtxSetLimit,
  4584. + * ::cuCtxSynchronize
  4585. + */
  4586. +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
  4587. +#endif /* __CUDA_API_VERSION >= 3020 */
  4588. +
  4589. +#if __CUDA_API_VERSION >= 4000
  4590. +/**
  4591. + * \brief Destroy a CUDA context
  4592. + *
  4593. + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be
  4594. + * destroyed regardless of how many threads it is current to.
  4595. + * It is the responsibility of the calling function to ensure that no API
  4596. + * call is issued using \p ctx while ::cuCtxDestroy() is executing.
  4597. + *
  4598. + * If \p ctx is current to the calling thread then \p ctx will also be
  4599. + * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
  4600. + * were called). If \p ctx is current to other threads, then \p ctx will
  4601. + * remain current to those threads, and attempting to access \p ctx from
  4602. + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
  4603. + *
  4604. + * \param ctx - Context to destroy
  4605. + *
  4606. + * \return
  4607. + * ::CUDA_SUCCESS,
  4608. + * ::CUDA_ERROR_DEINITIALIZED,
  4609. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4610. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4611. + * ::CUDA_ERROR_INVALID_VALUE
  4612. + * \notefnerr
  4613. + *
  4614. + * \sa ::cuCtxCreate,
  4615. + * ::cuCtxGetApiVersion,
  4616. + * ::cuCtxGetCacheConfig,
  4617. + * ::cuCtxGetDevice,
  4618. + * ::cuCtxGetLimit,
  4619. + * ::cuCtxPopCurrent,
  4620. + * ::cuCtxPushCurrent,
  4621. + * ::cuCtxSetCacheConfig,
  4622. + * ::cuCtxSetLimit,
  4623. + * ::cuCtxSynchronize
  4624. + */
  4625. +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
  4626. +#endif /* __CUDA_API_VERSION >= 4000 */
  4627. +
  4628. +/**
  4629. + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
  4630. + *
  4631. + * This section describes the deprecated context management functions of the low-level
  4632. + * CUDA driver application programming interface.
  4633. + *
  4634. + * @{
  4635. + */
  4636. +
  4637. +/**
  4638. + * \brief Increment a context's usage-count
  4639. + *
  4640. + * \deprecated
  4641. + *
  4642. + * Note that this function is deprecated and should not be used.
  4643. + *
  4644. + * Increments the usage count of the context and passes back a context handle
  4645. + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
  4646. + * done with the context. ::cuCtxAttach() fails if there is no context current
  4647. + * to the thread.
  4648. + *
  4649. + * Currently, the \p flags parameter must be 0.
  4650. + *
  4651. + * \param pctx - Returned context handle of the current context
  4652. + * \param flags - Context attach flags (must be 0)
  4653. + *
  4654. + * \return
  4655. + * ::CUDA_SUCCESS,
  4656. + * ::CUDA_ERROR_DEINITIALIZED,
  4657. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4658. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4659. + * ::CUDA_ERROR_INVALID_VALUE
  4660. + * \notefnerr
  4661. + *
  4662. + * \sa ::cuCtxCreate,
  4663. + * ::cuCtxDestroy,
  4664. + * ::cuCtxDetach,
  4665. + * ::cuCtxGetApiVersion,
  4666. + * ::cuCtxGetCacheConfig,
  4667. + * ::cuCtxGetDevice,
  4668. + * ::cuCtxGetLimit,
  4669. + * ::cuCtxPopCurrent,
  4670. + * ::cuCtxPushCurrent,
  4671. + * ::cuCtxSetCacheConfig,
  4672. + * ::cuCtxSetLimit,
  4673. + * ::cuCtxSynchronize
  4674. + */
  4675. +CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
  4676. +
  4677. +/**
  4678. + * \brief Decrement a context's usage-count
  4679. + *
  4680. + * \deprecated
  4681. + *
  4682. + * Note that this function is deprecated and should not be used.
  4683. + *
  4684. + * Decrements the usage count of the context \p ctx, and destroys the context
  4685. + * if the usage count goes to 0. The context must be a handle that was passed
  4686. + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
  4687. + * calling thread.
  4688. + *
  4689. + * \param ctx - Context to destroy
  4690. + *
  4691. + * \return
  4692. + * ::CUDA_SUCCESS,
  4693. + * ::CUDA_ERROR_DEINITIALIZED,
  4694. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4695. + * ::CUDA_ERROR_INVALID_CONTEXT
  4696. + * \notefnerr
  4697. + *
  4698. + * \sa ::cuCtxCreate,
  4699. + * ::cuCtxDestroy,
  4700. + * ::cuCtxGetApiVersion,
  4701. + * ::cuCtxGetCacheConfig,
  4702. + * ::cuCtxGetDevice,
  4703. + * ::cuCtxGetLimit,
  4704. + * ::cuCtxPopCurrent,
  4705. + * ::cuCtxPushCurrent,
  4706. + * ::cuCtxSetCacheConfig,
  4707. + * ::cuCtxSetLimit,
  4708. + * ::cuCtxSynchronize
  4709. + */
  4710. +CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
  4711. +
  4712. +/** @} */ /* END CUDA_CTX_DEPRECATED */
  4713. +
  4714. +#if __CUDA_API_VERSION >= 4000
  4715. +/**
  4716. + * \brief Pushes a context on the current CPU thread
  4717. + *
  4718. + * Pushes the given context \p ctx onto the CPU thread's stack of current
  4719. + * contexts. The specified context becomes the CPU thread's current context, so
  4720. + * all CUDA functions that operate on the current context are affected.
  4721. + *
  4722. + * The previous current context may be made current again by calling
  4723. + * ::cuCtxDestroy() or ::cuCtxPopCurrent().
  4724. + *
  4725. + * \param ctx - Context to push
  4726. + *
  4727. + * \return
  4728. + * ::CUDA_SUCCESS,
  4729. + * ::CUDA_ERROR_DEINITIALIZED,
  4730. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4731. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4732. + * ::CUDA_ERROR_INVALID_VALUE
  4733. + * \notefnerr
  4734. + *
  4735. + * \sa ::cuCtxCreate,
  4736. + * ::cuCtxDestroy,
  4737. + * ::cuCtxGetApiVersion,
  4738. + * ::cuCtxGetCacheConfig,
  4739. + * ::cuCtxGetDevice,
  4740. + * ::cuCtxGetLimit,
  4741. + * ::cuCtxPopCurrent,
  4742. + * ::cuCtxSetCacheConfig,
  4743. + * ::cuCtxSetLimit,
  4744. + * ::cuCtxSynchronize
  4745. + */
  4746. +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
  4747. +
  4748. +/**
  4749. + * \brief Pops the current CUDA context from the current CPU thread.
  4750. + *
  4751. + * Pops the current CUDA context from the CPU thread and passes back the
  4752. + * old context handle in \p *pctx. That context may then be made current
  4753. + * to a different CPU thread by calling ::cuCtxPushCurrent().
  4754. + *
  4755. + * If a context was current to the CPU thread before ::cuCtxCreate() or
  4756. + * ::cuCtxPushCurrent() was called, this function makes that context current to
  4757. + * the CPU thread again.
  4758. + *
  4759. + * \param pctx - Returned handle of the popped (previously current) context
  4760. + *
  4761. + * \return
  4762. + * ::CUDA_SUCCESS,
  4763. + * ::CUDA_ERROR_DEINITIALIZED,
  4764. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4765. + * ::CUDA_ERROR_INVALID_CONTEXT
  4766. + * \notefnerr
  4767. + *
  4768. + * \sa ::cuCtxCreate,
  4769. + * ::cuCtxDestroy,
  4770. + * ::cuCtxGetApiVersion,
  4771. + * ::cuCtxGetCacheConfig,
  4772. + * ::cuCtxGetDevice,
  4773. + * ::cuCtxGetLimit,
  4774. + * ::cuCtxPushCurrent,
  4775. + * ::cuCtxSetCacheConfig,
  4776. + * ::cuCtxSetLimit,
  4777. + * ::cuCtxSynchronize
  4778. + */
  4779. +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
  4780. +
  4781. +/**
  4782. + * \brief Binds the specified CUDA context to the calling CPU thread
  4783. + *
  4784. + * Binds the specified CUDA context to the calling CPU thread.
  4785. + * If \p ctx is NULL then the CUDA context previously bound to the
  4786. + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
  4787. + *
  4788. + * If there exists a CUDA context stack on the calling CPU thread, this
  4789. + * will replace the top of that stack with \p ctx.
  4790. + * If \p ctx is NULL then this will be equivalent to popping the top
  4791. + * of the calling CPU thread's CUDA context stack (or a no-op if the
  4792. + * calling CPU thread's CUDA context stack is empty).
  4793. + *
  4794. + * \param ctx - Context to bind to the calling CPU thread
  4795. + *
  4796. + * \return
  4797. + * ::CUDA_SUCCESS,
  4798. + * ::CUDA_ERROR_DEINITIALIZED,
  4799. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4800. + * ::CUDA_ERROR_INVALID_CONTEXT
  4801. + * \notefnerr
  4802. + *
  4803. + * \sa ::cuCtxGetCurrent, ::cuCtxCreate, ::cuCtxDestroy
  4804. + */
  4805. +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
  4806. +
  4807. +/**
  4808. + * \brief Returns the CUDA context bound to the calling CPU thread.
  4809. + *
  4810. + * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
  4811. + * If no context is bound to the calling CPU thread then \p *pctx is
  4812. + * set to NULL and ::CUDA_SUCCESS is returned.
  4813. + *
  4814. + * \param pctx - Returned context handle
  4815. + *
  4816. + * \return
  4817. + * ::CUDA_SUCCESS,
  4818. + * ::CUDA_ERROR_DEINITIALIZED,
  4819. + * ::CUDA_ERROR_NOT_INITIALIZED
  4820. + * \notefnerr
  4821. + *
  4822. + * \sa ::cuCtxSetCurrent, ::cuCtxCreate, ::cuCtxDestroy
  4823. + */
  4824. +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
  4825. +#endif /* __CUDA_API_VERSION >= 4000 */
  4826. +
  4827. +/**
  4828. + * \brief Returns the device ID for the current context
  4829. + *
  4830. + * Returns in \p *device the ordinal of the current context's device.
  4831. + *
  4832. + * \param device - Returned device ID for the current context
  4833. + *
  4834. + * \return
  4835. + * ::CUDA_SUCCESS,
  4836. + * ::CUDA_ERROR_DEINITIALIZED,
  4837. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4838. + * ::CUDA_ERROR_INVALID_CONTEXT,
  4839. + * ::CUDA_ERROR_INVALID_VALUE
  4840. + * \notefnerr
  4841. + *
  4842. + * \sa ::cuCtxCreate,
  4843. + * ::cuCtxDestroy,
  4844. + * ::cuCtxGetApiVersion,
  4845. + * ::cuCtxGetCacheConfig,
  4846. + * ::cuCtxGetLimit,
  4847. + * ::cuCtxPopCurrent,
  4848. + * ::cuCtxPushCurrent,
  4849. + * ::cuCtxSetCacheConfig,
  4850. + * ::cuCtxSetLimit,
  4851. + * ::cuCtxSynchronize
  4852. + */
  4853. +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
  4854. +
  4855. +/**
  4856. + * \brief Block for a context's tasks to complete
  4857. + *
  4858. + * Blocks until the device has completed all preceding requested tasks.
  4859. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
  4860. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
  4861. + * CPU thread will block until the GPU context has finished its work.
  4862. + *
  4863. + * \return
  4864. + * ::CUDA_SUCCESS,
  4865. + * ::CUDA_ERROR_DEINITIALIZED,
  4866. + * ::CUDA_ERROR_NOT_INITIALIZED,
  4867. + * ::CUDA_ERROR_INVALID_CONTEXT
  4868. + * \notefnerr
  4869. + *
  4870. + * \sa ::cuCtxCreate,
  4871. + * ::cuCtxDestroy,
  4872. + * ::cuCtxGetApiVersion,
  4873. + * ::cuCtxGetCacheConfig,
  4874. + * ::cuCtxGetDevice,
  4875. + * ::cuCtxGetLimit,
  4876. + * ::cuCtxPopCurrent,
  4877. + * ::cuCtxPushCurrent,
  4878. + * ::cuCtxSetCacheConfig,
  4879. + * ::cuCtxSetLimit
  4880. + */
  4881. +CUresult CUDAAPI cuCtxSynchronize(void);
  4882. +
  4883. +/**
  4884. + * \brief Set resource limits
  4885. + *
  4886. + * Setting \p limit to \p value is a request by the application to update
  4887. + * the current limit maintained by the context. The driver is free to
  4888. + * modify the requested value to meet h/w requirements (this could be
  4889. + * clamping to minimum or maximum values, rounding up to nearest element
  4890. + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
  4891. + * what the limit has been set to.
  4892. + *
  4893. + * Setting each ::CUlimit has its own specific restrictions, so each is
  4894. + * discussed here.
  4895. + *
  4896. + * - ::CU_LIMIT_STACK_SIZE controls the stack size of each GPU thread.
  4897. + * This limit is only applicable to devices of compute capability
  4898. + * 2.0 and higher. Attempting to set this limit on devices of
  4899. + * compute capability less than 2.0 will result in the error
  4900. + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
  4901. + *
  4902. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size of the FIFO used
  4903. + * by the ::printf() device system call. Setting
  4904. + * ::CU_LIMIT_PRINTF_FIFO_SIZE must be performed before launching any
  4905. + * kernel that uses the ::printf() device system call, otherwise
  4906. + * ::CUDA_ERROR_INVALID_VALUE will be returned.
  4907. + * This limit is only applicable to devices of compute capability
  4908. + * 2.0 and higher. Attempting to set this limit on devices of
  4909. + * compute capability less than 2.0 will result in the error
  4910. + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
  4911. + *
  4912. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size of the heap used
  4913. + * by the ::malloc() and ::free() device system calls. Setting
  4914. + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching
  4915. + * any kernel that uses the ::malloc() or ::free() device system calls,
  4916. + * otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
  4917. + * This limit is only applicable to devices of compute capability
  4918. + * 2.0 and higher. Attempting to set this limit on devices of
  4919. + * compute capability less than 2.0 will result in the error
  4920. + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
  4921. + *
  4922. + * \param limit - Limit to set
  4923. + * \param value - Size in bytes of limit
  4924. + *
  4925. + * \return
  4926. + * ::CUDA_SUCCESS,
  4927. + * ::CUDA_ERROR_INVALID_VALUE,
  4928. + * ::CUDA_ERROR_UNSUPPORTED_LIMIT
  4929. + * \notefnerr
  4930. + *
  4931. + * \sa ::cuCtxCreate,
  4932. + * ::cuCtxDestroy,
  4933. + * ::cuCtxGetApiVersion,
  4934. + * ::cuCtxGetCacheConfig,
  4935. + * ::cuCtxGetDevice,
  4936. + * ::cuCtxGetLimit,
  4937. + * ::cuCtxPopCurrent,
  4938. + * ::cuCtxPushCurrent,
  4939. + * ::cuCtxSetCacheConfig,
  4940. + * ::cuCtxSynchronize
  4941. + */
  4942. +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
  4943. +
  4944. +/**
  4945. + * \brief Returns resource limits
  4946. + *
  4947. + * Returns in \p *pvalue the current size of \p limit. The supported
  4948. + * ::CUlimit values are:
  4949. + * - ::CU_LIMIT_STACK_SIZE: stack size of each GPU thread;
  4950. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size of the FIFO used by the
  4951. + * ::printf() device system call;
  4952. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size of the heap used by the
  4953. + * ::malloc() and ::free() device system calls.
  4954. + *
  4955. + * \param limit - Limit to query
  4956. + * \param pvalue - Returned size in bytes of limit
  4957. + *
  4958. + * \return
  4959. + * ::CUDA_SUCCESS,
  4960. + * ::CUDA_ERROR_INVALID_VALUE,
  4961. + * ::CUDA_ERROR_UNSUPPORTED_LIMIT
  4962. + * \notefnerr
  4963. + *
  4964. + * \sa ::cuCtxCreate,
  4965. + * ::cuCtxDestroy,
  4966. + * ::cuCtxGetApiVersion,
  4967. + * ::cuCtxGetCacheConfig,
  4968. + * ::cuCtxGetDevice,
  4969. + * ::cuCtxPopCurrent,
  4970. + * ::cuCtxPushCurrent,
  4971. + * ::cuCtxSetCacheConfig,
  4972. + * ::cuCtxSetLimit,
  4973. + * ::cuCtxSynchronize
  4974. + */
  4975. +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
  4976. +
  4977. +/**
  4978. + * \brief Returns the preferred cache configuration for the current context.
  4979. + *
  4980. + * On devices where the L1 cache and shared memory use the same hardware
  4981. + * resources, this function returns through \p pconfig the preferred cache configuration
  4982. + * for the current context. This is only a preference. The driver will use
  4983. + * the requested configuration if possible, but it is free to choose a different
  4984. + * configuration if required to execute functions.
  4985. + *
  4986. + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
  4987. + * where the size of the L1 cache and shared memory are fixed.
  4988. + *
  4989. + * The supported cache configurations are:
  4990. + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
  4991. + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
  4992. + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
  4993. + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
  4994. + *
  4995. + * \param pconfig - Returned cache configuration
  4996. + *
  4997. + * \return
  4998. + * ::CUDA_SUCCESS,
  4999. + * ::CUDA_ERROR_DEINITIALIZED,
  5000. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5001. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5002. + * ::CUDA_ERROR_INVALID_VALUE
  5003. + * \notefnerr
  5004. + *
  5005. + * \sa ::cuCtxCreate,
  5006. + * ::cuCtxDestroy,
  5007. + * ::cuCtxGetApiVersion,
  5008. + * ::cuCtxGetDevice,
  5009. + * ::cuCtxGetLimit,
  5010. + * ::cuCtxPopCurrent,
  5011. + * ::cuCtxPushCurrent,
  5012. + * ::cuCtxSetCacheConfig,
  5013. + * ::cuCtxSetLimit,
  5014. + * ::cuCtxSynchronize,
  5015. + * ::cuFuncSetCacheConfig
  5016. + */
  5017. +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
  5018. +
  5019. +/**
  5020. + * \brief Sets the preferred cache configuration for the current context.
  5021. + *
  5022. + * On devices where the L1 cache and shared memory use the same hardware
  5023. + * resources, this sets through \p config the preferred cache configuration for
  5024. + * the current context. This is only a preference. The driver will use
  5025. + * the requested configuration if possible, but it is free to choose a different
  5026. + * configuration if required to execute the function. Any function preference
  5027. + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide
  5028. + * setting. Setting the context-wide cache configuration to
  5029. + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
  5030. + * to not change the cache configuration unless required to launch the kernel.
  5031. + *
  5032. + * This setting does nothing on devices where the size of the L1 cache and
  5033. + * shared memory are fixed.
  5034. + *
  5035. + * Launching a kernel with a different preference than the most recent
  5036. + * preference setting may insert a device-side synchronization point.
  5037. + *
  5038. + * The supported cache configurations are:
  5039. + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
  5040. + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
  5041. + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
  5042. + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
  5043. + *
  5044. + * \param config - Requested cache configuration
  5045. + *
  5046. + * \return
  5047. + * ::CUDA_SUCCESS,
  5048. + * ::CUDA_ERROR_DEINITIALIZED,
  5049. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5050. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5051. + * ::CUDA_ERROR_INVALID_VALUE
  5052. + * \notefnerr
  5053. + *
  5054. + * \sa ::cuCtxCreate,
  5055. + * ::cuCtxDestroy,
  5056. + * ::cuCtxGetApiVersion,
  5057. + * ::cuCtxGetCacheConfig,
  5058. + * ::cuCtxGetDevice,
  5059. + * ::cuCtxGetLimit,
  5060. + * ::cuCtxPopCurrent,
  5061. + * ::cuCtxPushCurrent,
  5062. + * ::cuCtxSetLimit,
  5063. + * ::cuCtxSynchronize,
  5064. + * ::cuFuncSetCacheConfig
  5065. + */
  5066. +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
  5067. +
  5068. +#if __CUDA_API_VERSION >= 4020
  5069. +/**
  5070. + * \brief Returns the current shared memory configuration for the current context.
  5071. + *
  5072. + * This function will return in \p pConfig the current size of shared memory banks
  5073. + * in the current context. On devices with configurable shared memory banks,
  5074. + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
  5075. + * subsequent kernel launches will by default use the new bank size. When
  5076. + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
  5077. + * memory, it will return the fixed bank size of the hardware.
  5078. + *
  5079. + * The returned bank configurations can be either:
  5080. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is
  5081. + * four bytes.
  5082. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
  5083. + * eight bytes.
  5084. + *
  5085. + * \param pConfig - returned shared memory configuration
  5086. + * \return
  5087. + * ::CUDA_SUCCESS,
  5088. + * ::CUDA_ERROR_DEINITIALIZED,
  5089. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5090. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5091. + * ::CUDA_ERROR_INVALID_VALUE
  5092. + * \notefnerr
  5093. + *
  5094. + * \sa ::cuCtxCreate,
  5095. + * ::cuCtxDestroy,
  5096. + * ::cuCtxGetApiVersion,
  5097. + * ::cuCtxGetCacheConfig,
  5098. + * ::cuCtxGetDevice,
  5099. + * ::cuCtxGetLimit,
  5100. + * ::cuCtxPopCurrent,
  5101. + * ::cuCtxPushCurrent,
  5102. + * ::cuCtxSetLimit,
  5103. + * ::cuCtxSynchronize,
  5104. + * ::cuCtxSetSharedMemConfig,
  5105. + * ::cuFuncSetCacheConfig
  5106. + */
  5107. +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
  5108. +
  5109. +/**
  5110. + * \brief Sets the shared memory configuration for the current context.
  5111. + *
  5112. + * On devices with configurable shared memory banks, this function will set
  5113. + * the context's shared memory bank size which is used for subsequent kernel
  5114. + * launches.
  5115. + *
  5116. + * Changing the shared memory configuration between launches may insert a device
  5117. + * side synchronization point between those launches.
  5118. + *
  5119. + * Changing the shared memory bank size will not increase shared memory usage
  5120. + * or affect occupancy of kernels, but may have major effects on performance.
  5121. + * Larger bank sizes will allow for greater potential bandwidth to shared memory,
  5122. + * but will change what kinds of accesses to shared memory will result in bank
  5123. + * conflicts.
  5124. + *
  5125. + * This function will do nothing on devices with fixed shared memory bank size.
  5126. + *
  5127. + * The supported bank configurations are:
  5128. + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
  5129. + * setting (currently, four bytes).
  5130. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
  5131. + * be natively four bytes.
  5132. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
  5133. + * be natively eight bytes.
  5134. + *
  5135. + * \param config - requested shared memory configuration
  5136. + *
  5137. + * \return
  5138. + * ::CUDA_SUCCESS,
  5139. + * ::CUDA_ERROR_DEINITIALIZED,
  5140. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5141. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5142. + * ::CUDA_ERROR_INVALID_VALUE
  5143. + * \notefnerr
  5144. + *
  5145. + * \sa ::cuCtxCreate,
  5146. + * ::cuCtxDestroy,
  5147. + * ::cuCtxGetApiVersion,
  5148. + * ::cuCtxGetCacheConfig,
  5149. + * ::cuCtxGetDevice,
  5150. + * ::cuCtxGetLimit,
  5151. + * ::cuCtxPopCurrent,
  5152. + * ::cuCtxPushCurrent,
  5153. + * ::cuCtxSetLimit,
  5154. + * ::cuCtxSynchronize,
  5155. + * ::cuCtxGetSharedMemConfig,
  5156. + * ::cuFuncSetCacheConfig
  5157. + */
  5158. +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
  5159. +#endif
  5160. +
  5161. +/**
  5162. + * \brief Gets the context's API version.
  5163. + *
  5164. + * Returns a version number in \p version corresponding to the capabilities of
  5165. + * the context (e.g. 3010 or 3020), which library developers can use to direct
  5166. + * callers to a specific API version. If \p ctx is NULL, returns the API version
  5167. + * used to create the currently bound context.
  5168. + *
  5169. + * Note that new API versions are only introduced when context capabilities are
  5170. + * changed that break binary compatibility, so the API version and driver version
  5171. + * may be different. For example, it is valid for the API version to be 3020 while
  5172. + * the driver version is 4010.
  5173. + *
  5174. + * \param ctx - Context to check
  5175. + * \param version - Pointer to version
  5176. + *
  5177. + * \return
  5178. + * ::CUDA_SUCCESS,
  5179. + * ::CUDA_ERROR_DEINITIALIZED,
  5180. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5181. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5182. + * ::CUDA_ERROR_UNKNOWN
  5183. + * \notefnerr
  5184. + *
  5185. + * \sa ::cuCtxCreate,
  5186. + * ::cuCtxDestroy,
  5187. + * ::cuCtxGetDevice,
  5188. + * ::cuCtxGetLimit,
  5189. + * ::cuCtxPopCurrent,
  5190. + * ::cuCtxPushCurrent,
  5191. + * ::cuCtxSetCacheConfig,
  5192. + * ::cuCtxSetLimit,
  5193. + * ::cuCtxSynchronize
  5194. + */
  5195. +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
  5196. +
  5197. +/** @} */ /* END CUDA_CTX */
  5198. +
  5199. +
  5200. +/**
  5201. + * \defgroup CUDA_MODULE Module Management
  5202. + *
  5203. + * This section describes the module management functions of the low-level CUDA
  5204. + * driver application programming interface.
  5205. + *
  5206. + * @{
  5207. + */
  5208. +
  5209. +/**
  5210. + * \brief Loads a compute module
  5211. + *
  5212. + * Takes a filename \p fname and loads the corresponding module \p module into
  5213. + * the current context. The CUDA driver API does not attempt to lazily
  5214. + * allocate the resources needed by a module; if the memory for functions and
  5215. + * data (constant and global) needed by the module cannot be allocated,
  5216. + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
  5217. + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
  5218. + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
  5219. + *
  5220. + * \param module - Returned module
  5221. + * \param fname - Filename of module to load
  5222. + *
  5223. + * \return
  5224. + * ::CUDA_SUCCESS,
  5225. + * ::CUDA_ERROR_DEINITIALIZED,
  5226. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5227. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5228. + * ::CUDA_ERROR_INVALID_VALUE,
  5229. + * ::CUDA_ERROR_NOT_FOUND,
  5230. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  5231. + * ::CUDA_ERROR_FILE_NOT_FOUND,
  5232. + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  5233. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  5234. + * \notefnerr
  5235. + *
  5236. + * \sa ::cuModuleGetFunction,
  5237. + * ::cuModuleGetGlobal,
  5238. + * ::cuModuleGetTexRef,
  5239. + * ::cuModuleLoadData,
  5240. + * ::cuModuleLoadDataEx,
  5241. + * ::cuModuleLoadFatBinary,
  5242. + * ::cuModuleUnload
  5243. + */
  5244. +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
  5245. +
  5246. +/**
  5247. + * \brief Load a module's data
  5248. + *
  5249. + * Takes a pointer \p image and loads the corresponding module \p module into
  5250. + * the current context. The pointer may be obtained by mapping a \e cubin or
  5251. + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
  5252. + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
  5253. + * object into the executable resources and using operating system calls such
  5254. + * as Windows \c FindResource() to obtain the pointer.
  5255. + *
  5256. + * \param module - Returned module
  5257. + * \param image - Module data to load
  5258. + *
  5259. + * \return
  5260. + * ::CUDA_SUCCESS,
  5261. + * ::CUDA_ERROR_DEINITIALIZED,
  5262. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5263. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5264. + * ::CUDA_ERROR_INVALID_VALUE,
  5265. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  5266. + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  5267. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  5268. + * \notefnerr
  5269. + *
  5270. + * \sa ::cuModuleGetFunction,
  5271. + * ::cuModuleGetGlobal,
  5272. + * ::cuModuleGetTexRef,
  5273. + * ::cuModuleLoad,
  5274. + * ::cuModuleLoadDataEx,
  5275. + * ::cuModuleLoadFatBinary,
  5276. + * ::cuModuleUnload
  5277. + */
  5278. +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
  5279. +
  5280. +/**
  5281. + * \brief Load a module's data with options
  5282. + *
  5283. + * Takes a pointer \p image and loads the corresponding module \p module into
  5284. + * the current context. The pointer may be obtained by mapping a \e cubin or
  5285. + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
  5286. + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
  5287. + * object into the executable resources and using operating system calls such
  5288. + * as Windows \c FindResource() to obtain the pointer. Options are passed as
  5289. + * an array via \p options and any corresponding parameters are passed in
  5290. + * \p optionValues. The number of total options is supplied via \p numOptions.
  5291. + * Any outputs will be returned via \p optionValues. Supported options are
  5292. + * (types for the option values are specified in parentheses after the option
  5293. + * name):
  5294. + *
  5295. + * - ::CU_JIT_MAX_REGISTERS: (unsigned int) input specifies the maximum number
  5296. + * of registers per thread;
  5297. + * - ::CU_JIT_THREADS_PER_BLOCK: (unsigned int) input specifies number of
  5298. + * threads per block to target compilation for; output returns the number of
  5299. + * threads the compiler actually targeted;
  5300. + * - ::CU_JIT_WALL_TIME: (float) output returns the float value of wall clock
  5301. + * time, in milliseconds, spent compiling the \e PTX code;
  5302. + * - ::CU_JIT_INFO_LOG_BUFFER: (char*) input is a pointer to a buffer in
  5303. + * which to print any informational log messages from \e PTX assembly (the
  5304. + * buffer size is specified via option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
  5305. + * - ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: (unsigned int) input is the size in
  5306. + * bytes of the buffer; output is the number of bytes filled with messages;
  5307. + * - ::CU_JIT_ERROR_LOG_BUFFER: (char*) input is a pointer to a buffer in
  5308. + * which to print any error log messages from \e PTX assembly (the buffer size
  5309. + * is specified via option ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES);
  5310. + * - ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: (unsigned int) input is the size in
  5311. + * bytes of the buffer; output is the number of bytes filled with messages;
  5312. + * - ::CU_JIT_OPTIMIZATION_LEVEL: (unsigned int) input is the level of
  5313. + * optimization to apply to generated code (0 - 4), with 4 being the default
  5314. + * and highest level;
  5315. + * - ::CU_JIT_TARGET_FROM_CUCONTEXT: (No option value) causes compilation
  5316. + * target to be determined based on current attached context (default);
  5317. + * - ::CU_JIT_TARGET: (unsigned int for enumerated type ::CUjit_target_enum)
  5318. + * input is the compilation target based on supplied ::CUjit_target_enum;
  5319. + * possible values are:
  5320. + * - ::CU_TARGET_COMPUTE_10
  5321. + * - ::CU_TARGET_COMPUTE_11
  5322. + * - ::CU_TARGET_COMPUTE_12
  5323. + * - ::CU_TARGET_COMPUTE_13
  5324. + * - ::CU_TARGET_COMPUTE_20
  5325. + * - ::CU_JIT_FALLBACK_STRATEGY: (unsigned int for enumerated type
  5326. + * ::CUjit_fallback_enum) chooses fallback strategy if matching cubin is not
  5327. + * found; possible values are:
  5328. + * - ::CU_PREFER_PTX
  5329. + * - ::CU_PREFER_BINARY
  5330. + *
  5331. + * \param module - Returned module
  5332. + * \param image - Module data to load
  5333. + * \param numOptions - Number of options
  5334. + * \param options - Options for JIT
  5335. + * \param optionValues - Option values for JIT
  5336. + *
  5337. + * \return
  5338. + * ::CUDA_SUCCESS,
  5339. + * ::CUDA_ERROR_DEINITIALIZED,
  5340. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5341. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5342. + * ::CUDA_ERROR_INVALID_VALUE,
  5343. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  5344. + * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  5345. + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  5346. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  5347. + * \notefnerr
  5348. + *
  5349. + * \sa ::cuModuleGetFunction,
  5350. + * ::cuModuleGetGlobal,
  5351. + * ::cuModuleGetTexRef,
  5352. + * ::cuModuleLoad,
  5353. + * ::cuModuleLoadData,
  5354. + * ::cuModuleLoadFatBinary,
  5355. + * ::cuModuleUnload
  5356. + */
  5357. +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
  5358. +
  5359. +/**
  5360. + * \brief Load a module's data from a fat binary
  5361. + *
  5362. + * Takes a pointer \p fatCubin and loads the corresponding module \p module
  5363. + * into the current context. The pointer represents a <i>fat binary</i> object,
  5364. + * which is a collection of different \e cubin and/or \e PTX files, all
  5365. + * representing the same device code, but compiled and optimized for different
  5366. + * architectures.
  5367. + *
  5368. + * Prior to CUDA 4.0, there was no documented API for constructing and using
  5369. + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary
  5370. + * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
  5371. + * More information can be found in the \b nvcc document.
  5372. + *
  5373. + * \param module - Returned module
  5374. + * \param fatCubin - Fat binary to load
  5375. + *
  5376. + * \return
  5377. + * ::CUDA_SUCCESS,
  5378. + * ::CUDA_ERROR_DEINITIALIZED,
  5379. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5380. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5381. + * ::CUDA_ERROR_INVALID_VALUE,
  5382. + * ::CUDA_ERROR_NOT_FOUND,
  5383. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  5384. + * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  5385. + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  5386. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  5387. + * \notefnerr
  5388. + *
  5389. + * \sa ::cuModuleGetFunction,
  5390. + * ::cuModuleGetGlobal,
  5391. + * ::cuModuleGetTexRef,
  5392. + * ::cuModuleLoad,
  5393. + * ::cuModuleLoadData,
  5394. + * ::cuModuleLoadDataEx,
  5395. + * ::cuModuleUnload
  5396. + */
  5397. +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
  5398. +
  5399. +/**
  5400. + * \brief Unloads a module
  5401. + *
  5402. + * Unloads a module \p hmod from the current context.
  5403. + *
  5404. + * \param hmod - Module to unload
  5405. + *
  5406. + * \return
  5407. + * ::CUDA_SUCCESS,
  5408. + * ::CUDA_ERROR_DEINITIALIZED,
  5409. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5410. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5411. + * ::CUDA_ERROR_INVALID_VALUE
  5412. + * \notefnerr
  5413. + *
  5414. + * \sa ::cuModuleGetFunction,
  5415. + * ::cuModuleGetGlobal,
  5416. + * ::cuModuleGetTexRef,
  5417. + * ::cuModuleLoad,
  5418. + * ::cuModuleLoadData,
  5419. + * ::cuModuleLoadDataEx,
  5420. + * ::cuModuleLoadFatBinary
  5421. + */
  5422. +CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
  5423. +
  5424. +/**
  5425. + * \brief Returns a function handle
  5426. + *
  5427. + * Returns in \p *hfunc the handle of the function of name \p name located in
  5428. + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
  5429. + * returns ::CUDA_ERROR_NOT_FOUND.
  5430. + *
  5431. + * \param hfunc - Returned function handle
  5432. + * \param hmod - Module to retrieve function from
  5433. + * \param name - Name of function to retrieve
  5434. + *
  5435. + * \return
  5436. + * ::CUDA_SUCCESS,
  5437. + * ::CUDA_ERROR_DEINITIALIZED,
  5438. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5439. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5440. + * ::CUDA_ERROR_INVALID_VALUE,
  5441. + * ::CUDA_ERROR_NOT_FOUND
  5442. + * \notefnerr
  5443. + *
  5444. + * \sa ::cuModuleGetGlobal,
  5445. + * ::cuModuleGetTexRef,
  5446. + * ::cuModuleLoad,
  5447. + * ::cuModuleLoadData,
  5448. + * ::cuModuleLoadDataEx,
  5449. + * ::cuModuleLoadFatBinary,
  5450. + * ::cuModuleUnload
  5451. + */
  5452. +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
  5453. +
  5454. +#if __CUDA_API_VERSION >= 3020
  5455. +/**
  5456. + * \brief Returns a global pointer from a module
  5457. + *
  5458. + * Returns in \p *dptr and \p *bytes the base pointer and size of the
  5459. + * global of name \p name located in module \p hmod. If no variable of that name
  5460. + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both
  5461. + * parameters \p dptr and \p bytes are optional. If one of them is
  5462. + * NULL, it is ignored.
  5463. + *
  5464. + * \param dptr - Returned global device pointer
  5465. + * \param bytes - Returned global size in bytes
  5466. + * \param hmod - Module to retrieve global from
  5467. + * \param name - Name of global to retrieve
  5468. + *
  5469. + * \return
  5470. + * ::CUDA_SUCCESS,
  5471. + * ::CUDA_ERROR_DEINITIALIZED,
  5472. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5473. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5474. + * ::CUDA_ERROR_INVALID_VALUE,
  5475. + * ::CUDA_ERROR_NOT_FOUND
  5476. + * \notefnerr
  5477. + *
  5478. + * \sa ::cuModuleGetFunction,
  5479. + * ::cuModuleGetTexRef,
  5480. + * ::cuModuleLoad,
  5481. + * ::cuModuleLoadData,
  5482. + * ::cuModuleLoadDataEx,
  5483. + * ::cuModuleLoadFatBinary,
  5484. + * ::cuModuleUnload
  5485. + */
  5486. +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
  5487. +#endif /* __CUDA_API_VERSION >= 3020 */
  5488. +
  5489. +/**
  5490. + * \brief Returns a handle to a texture reference
  5491. + *
  5492. + * Returns in \p *pTexRef the handle of the texture reference of name \p name
  5493. + * in the module \p hmod. If no texture reference of that name exists,
  5494. + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
  5495. + * handle should not be destroyed, since it will be destroyed when the module
  5496. + * is unloaded.
  5497. + *
  5498. + * \param pTexRef - Returned texture reference
  5499. + * \param hmod - Module to retrieve texture reference from
  5500. + * \param name - Name of texture reference to retrieve
  5501. + *
  5502. + * \return
  5503. + * ::CUDA_SUCCESS,
  5504. + * ::CUDA_ERROR_DEINITIALIZED,
  5505. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5506. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5507. + * ::CUDA_ERROR_INVALID_VALUE,
  5508. + * ::CUDA_ERROR_NOT_FOUND
  5509. + * \notefnerr
  5510. + *
  5511. + * \sa ::cuModuleGetFunction,
  5512. + * ::cuModuleGetGlobal,
  5513. + * ::cuModuleGetSurfRef,
  5514. + * ::cuModuleLoad,
  5515. + * ::cuModuleLoadData,
  5516. + * ::cuModuleLoadDataEx,
  5517. + * ::cuModuleLoadFatBinary,
  5518. + * ::cuModuleUnload
  5519. + */
  5520. +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
  5521. +
  5522. +/**
  5523. + * \brief Returns a handle to a surface reference
  5524. + *
  5525. + * Returns in \p *pSurfRef the handle of the surface reference of name \p name
  5526. + * in the module \p hmod. If no surface reference of that name exists,
  5527. + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
  5528. + *
  5529. + * \param pSurfRef - Returned surface reference
  5530. + * \param hmod - Module to retrieve surface reference from
  5531. + * \param name - Name of surface reference to retrieve
  5532. + *
  5533. + * \return
  5534. + * ::CUDA_SUCCESS,
  5535. + * ::CUDA_ERROR_DEINITIALIZED,
  5536. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5537. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5538. + * ::CUDA_ERROR_INVALID_VALUE,
  5539. + * ::CUDA_ERROR_NOT_FOUND
  5540. + * \notefnerr
  5541. + *
  5542. + * \sa ::cuModuleGetFunction,
  5543. + * ::cuModuleGetGlobal,
  5544. + * ::cuModuleGetTexRef,
  5545. + * ::cuModuleLoad,
  5546. + * ::cuModuleLoadData,
  5547. + * ::cuModuleLoadDataEx,
  5548. + * ::cuModuleLoadFatBinary,
  5549. + * ::cuModuleUnload
  5550. + */
  5551. +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
  5552. +
  5553. +/** @} */ /* END CUDA_MODULE */
  5554. +
  5555. +
  5556. +/**
  5557. + * \defgroup CUDA_MEM Memory Management
  5558. + *
  5559. + * This section describes the memory management functions of the low-level CUDA
  5560. + * driver application programming interface.
  5561. + *
  5562. + * @{
  5563. + */
  5564. +
  5565. +#if __CUDA_API_VERSION >= 3020
  5566. +/**
  5567. + * \brief Gets free and total memory
  5568. + *
  5569. + * Returns in \p *free and \p *total respectively, the free and total amount of
  5570. + * memory available for allocation by the CUDA context, in bytes.
  5571. + *
  5572. + * \param free - Returned free memory in bytes
  5573. + * \param total - Returned total memory in bytes
  5574. + *
  5575. + * \return
  5576. + * ::CUDA_SUCCESS,
  5577. + * ::CUDA_ERROR_DEINITIALIZED,
  5578. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5579. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5580. + * ::CUDA_ERROR_INVALID_VALUE
  5581. + * \notefnerr
  5582. + *
  5583. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5584. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5585. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5586. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5587. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5588. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5589. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5590. + * ::cuMemGetAddressRange, ::cuMemHostAlloc,
  5591. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5592. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5593. + */
  5594. +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
  5595. +
  5596. +/**
  5597. + * \brief Allocates device memory
  5598. + *
  5599. + * Allocates \p bytesize bytes of linear memory on the device and returns in
  5600. + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
  5601. + * aligned for any kind of variable. The memory is not cleared. If \p bytesize
  5602. + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
  5603. + *
  5604. + * \param dptr - Returned device pointer
  5605. + * \param bytesize - Requested allocation size in bytes
  5606. + *
  5607. + * \return
  5608. + * ::CUDA_SUCCESS,
  5609. + * ::CUDA_ERROR_DEINITIALIZED,
  5610. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5611. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5612. + * ::CUDA_ERROR_INVALID_VALUE,
  5613. + * ::CUDA_ERROR_OUT_OF_MEMORY
  5614. + * \notefnerr
  5615. + *
  5616. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5617. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
  5618. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5619. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5620. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5621. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5622. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5623. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5624. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5625. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5626. + */
  5627. +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
  5628. +
  5629. +/**
  5630. + * \brief Allocates pitched device memory
  5631. + *
  5632. + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
  5633. + * the device and returns in \p *dptr a pointer to the allocated memory. The
  5634. + * function may pad the allocation to ensure that corresponding pointers in
  5635. + * any given row will continue to meet the alignment requirements for
  5636. + * coalescing as the address is updated from row to row. \p ElementSizeBytes
  5637. + * specifies the size of the largest reads and writes that will be performed
  5638. + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
  5639. + * memory transactions are not possible on other data sizes). If
  5640. + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
  5641. + * the kernel will run correctly, but possibly at reduced speed. The pitch
  5642. + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
  5643. + * allocation. The intended usage of pitch is as a separate parameter of the
  5644. + * allocation, used to compute addresses within the 2D array. Given the row
  5645. + * and column of an array element of type \b T, the address is computed as:
  5646. + * \code
  5647. + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
  5648. + * \endcode
  5649. + *
  5650. + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
  5651. + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
  5652. + * recommended that programmers consider performing pitch allocations using
  5653. + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
  5654. + * especially true if the application will be performing 2D memory copies
  5655. + * between different regions of device memory (whether linear memory or CUDA
  5656. + * arrays).
  5657. + *
  5658. + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
  5659. + * to match or exceed the alignment requirement for texture binding with
  5660. + * ::cuTexRefSetAddress2D().
  5661. + *
  5662. + * \param dptr - Returned device pointer
  5663. + * \param pPitch - Returned pitch of allocation in bytes
  5664. + * \param WidthInBytes - Requested allocation width in bytes
  5665. + * \param Height - Requested allocation height in rows
  5666. + * \param ElementSizeBytes - Size of largest reads/writes for range
  5667. + *
  5668. + * \return
  5669. + * ::CUDA_SUCCESS,
  5670. + * ::CUDA_ERROR_DEINITIALIZED,
  5671. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5672. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5673. + * ::CUDA_ERROR_INVALID_VALUE,
  5674. + * ::CUDA_ERROR_OUT_OF_MEMORY
  5675. + * \notefnerr
  5676. + *
  5677. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5678. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5679. + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5680. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5681. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5682. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5683. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5684. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5685. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5686. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5687. + */
  5688. +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
  5689. +
  5690. +/**
  5691. + * \brief Frees device memory
  5692. + *
  5693. + * Frees the memory space pointed to by \p dptr, which must have been returned
  5694. + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch().
  5695. + *
  5696. + * \param dptr - Pointer to memory to free
  5697. + *
  5698. + * \return
  5699. + * ::CUDA_SUCCESS,
  5700. + * ::CUDA_ERROR_DEINITIALIZED,
  5701. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5702. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5703. + * ::CUDA_ERROR_INVALID_VALUE
  5704. + * \notefnerr
  5705. + *
  5706. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5707. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5708. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5709. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5710. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5711. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5712. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
  5713. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5714. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5715. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5716. + */
  5717. +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
  5718. +
  5719. +/**
  5720. + * \brief Get information on memory allocations
  5721. + *
  5722. + * Returns the base address in \p *pbase and size in \p *psize of the
  5723. + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
  5724. + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
  5725. + * of them is NULL, it is ignored.
  5726. + *
  5727. + * \param pbase - Returned base address
  5728. + * \param psize - Returned size of device memory allocation
  5729. + * \param dptr - Device pointer to query
  5730. + *
  5731. + * \return
  5732. + * ::CUDA_SUCCESS,
  5733. + * ::CUDA_ERROR_DEINITIALIZED,
  5734. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5735. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5736. + * ::CUDA_ERROR_INVALID_VALUE
  5737. + * \notefnerr
  5738. + *
  5739. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5740. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5741. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5742. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5743. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5744. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5745. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5746. + * ::cuMemGetInfo, ::cuMemHostAlloc,
  5747. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5748. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5749. + */
  5750. +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
  5751. +
  5752. +/**
  5753. + * \brief Allocates page-locked host memory
  5754. + *
  5755. + * Allocates \p bytesize bytes of host memory that is page-locked and
  5756. + * accessible to the device. The driver tracks the virtual memory ranges
  5757. + * allocated with this function and automatically accelerates calls to
  5758. + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
  5759. + * the device, it can be read or written with much higher bandwidth than
  5760. + * pageable memory obtained with functions such as ::malloc(). Allocating
  5761. + * excessive amounts of memory with ::cuMemAllocHost() may degrade system
  5762. + * performance, since it reduces the amount of memory available to the system
  5763. + * for paging. As a result, this function is best used sparingly to allocate
  5764. + * staging areas for data exchange between host and device.
  5765. + *
  5766. + * Note all host memory allocated using ::cuMemHostAlloc() will automatically
  5767. + * be immediately accessible to all contexts on all devices which support unified
  5768. + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
  5769. + * The device pointer that may be used to access this host memory from those
  5770. + * contexts is always equal to the returned host pointer \p *pp.
  5771. + * See \ref CUDA_UNIFIED for additional details.
  5772. + *
  5773. + * \param pp - Returned host pointer to page-locked memory
  5774. + * \param bytesize - Requested allocation size in bytes
  5775. + *
  5776. + * \return
  5777. + * ::CUDA_SUCCESS,
  5778. + * ::CUDA_ERROR_DEINITIALIZED,
  5779. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5780. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5781. + * ::CUDA_ERROR_INVALID_VALUE,
  5782. + * ::CUDA_ERROR_OUT_OF_MEMORY
  5783. + * \notefnerr
  5784. + *
  5785. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5786. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
  5787. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5788. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5789. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5790. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5791. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5792. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5793. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5794. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5795. + */
  5796. +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
  5797. +#endif /* __CUDA_API_VERSION >= 3020 */
  5798. +
  5799. +/**
  5800. + * \brief Frees page-locked host memory
  5801. + *
  5802. + * Frees the memory space pointed to by \p p, which must have been returned by
  5803. + * a previous call to ::cuMemAllocHost() or ::cuMemHostAlloc().
  5804. + *
  5805. + * \param p - Pointer to memory to free
  5806. + *
  5807. + * \return
  5808. + * ::CUDA_SUCCESS,
  5809. + * ::CUDA_ERROR_DEINITIALIZED,
  5810. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5811. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5812. + * ::CUDA_ERROR_INVALID_VALUE
  5813. + * \notefnerr
  5814. + *
  5815. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5816. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5817. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5818. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5819. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5820. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5821. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
  5822. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5823. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5824. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5825. + */
  5826. +CUresult CUDAAPI cuMemFreeHost(void *p);
  5827. +
  5828. +/**
  5829. + * \brief Allocates page-locked host memory
  5830. + *
  5831. + * Allocates \p bytesize bytes of host memory that is page-locked and accessible
  5832. + * to the device. The driver tracks the virtual memory ranges allocated with
  5833. + * this function and automatically accelerates calls to functions such as
  5834. + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
  5835. + * it can be read or written with much higher bandwidth than pageable memory
  5836. + * obtained with functions such as ::malloc(). Allocating excessive amounts of
  5837. + * pinned memory may degrade system performance, since it reduces the amount
  5838. + * of memory available to the system for paging. As a result, this function is
  5839. + * best used sparingly to allocate staging areas for data exchange between
  5840. + * host and device.
  5841. + *
  5842. + * The \p Flags parameter enables different options to be specified that
  5843. + * affect the allocation, as follows.
  5844. + *
  5845. + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
  5846. + * considered as pinned memory by all CUDA contexts, not just the one that
  5847. + * performed the allocation.
  5848. + *
  5849. + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
  5850. + * space. The device pointer to the memory may be obtained by calling
  5851. + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs
  5852. + * with compute capability greater than or equal to 1.1.
  5853. + *
  5854. + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
  5855. + * (WC). WC memory can be transferred across the PCI Express bus more
  5856. + * quickly on some system configurations, but cannot be read efficiently by
  5857. + * most CPUs. WC memory is a good option for buffers that will be written by
  5858. + * the CPU and read by the GPU via mapped pinned memory or host->device
  5859. + * transfers.
  5860. + *
  5861. + * All of these flags are orthogonal to one another: a developer may allocate
  5862. + * memory that is portable, mapped and/or write-combined with no restrictions.
  5863. + *
  5864. + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
  5865. + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect.
  5866. + *
  5867. + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
  5868. + * devices that do not support mapped pinned memory. The failure is deferred
  5869. + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
  5870. + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
  5871. + *
  5872. + * The memory allocated by this function must be freed with ::cuMemFreeHost().
  5873. + *
  5874. + * Note all host memory allocated using ::cuMemHostAlloc() will automatically
  5875. + * be immediately accessible to all contexts on all devices which support unified
  5876. + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
  5877. + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
  5878. + * that may be used to access this host memory from those contexts is always equal
  5879. + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
  5880. + * is specified, then the function ::cuMemHostGetDevicePointer() must be used
  5881. + * to query the device pointer, even if the context supports unified addressing.
  5882. + * See \ref CUDA_UNIFIED for additional details.
  5883. + *
  5884. + * \param pp - Returned host pointer to page-locked memory
  5885. + * \param bytesize - Requested allocation size in bytes
  5886. + * \param Flags - Flags for allocation request
  5887. + *
  5888. + * \return
  5889. + * ::CUDA_SUCCESS,
  5890. + * ::CUDA_ERROR_DEINITIALIZED,
  5891. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5892. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5893. + * ::CUDA_ERROR_INVALID_VALUE,
  5894. + * ::CUDA_ERROR_OUT_OF_MEMORY
  5895. + * \notefnerr
  5896. + *
  5897. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5898. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5899. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5900. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5901. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5902. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5903. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5904. + * ::cuMemGetAddressRange, ::cuMemGetInfo,
  5905. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  5906. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5907. + */
  5908. +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
  5909. +
  5910. +#if __CUDA_API_VERSION >= 3020
  5911. +/**
  5912. + * \brief Passes back device pointer of mapped pinned memory
  5913. + *
  5914. + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
  5915. + * host buffer \p p allocated by ::cuMemHostAlloc.
  5916. + *
  5917. + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
  5918. + * flag was not specified at the time the memory was allocated, or if the
  5919. + * function is called on a GPU that does not support mapped pinned memory.
  5920. + *
  5921. + * \p Flags is reserved for future releases. For now, it must be set to 0.
  5922. + *
  5923. + * \param pdptr - Returned device pointer
  5924. + * \param p - Host pointer
  5925. + * \param Flags - Options (must be 0)
  5926. + *
  5927. + * \return
  5928. + * ::CUDA_SUCCESS,
  5929. + * ::CUDA_ERROR_DEINITIALIZED,
  5930. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5931. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5932. + * ::CUDA_ERROR_INVALID_VALUE
  5933. + * \notefnerr
  5934. + *
  5935. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  5936. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  5937. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  5938. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  5939. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  5940. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  5941. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  5942. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  5943. + * ::cuMemsetD2D8, ::cuMemsetD2D16,
  5944. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  5945. + */
  5946. +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
  5947. +#endif /* __CUDA_API_VERSION >= 3020 */
  5948. +
  5949. +/**
  5950. + * \brief Passes back flags that were used for a pinned allocation
  5951. + *
  5952. + * Passes back the flags \p pFlags that were specified when allocating
  5953. + * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
  5954. + *
  5955. + * ::cuMemHostGetFlags() will fail if the pointer does not reside in
  5956. + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
  5957. + *
  5958. + * \param pFlags - Returned flags word
  5959. + * \param p - Host pointer
  5960. + *
  5961. + * \return
  5962. + * ::CUDA_SUCCESS,
  5963. + * ::CUDA_ERROR_DEINITIALIZED,
  5964. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5965. + * ::CUDA_ERROR_INVALID_CONTEXT,
  5966. + * ::CUDA_ERROR_INVALID_VALUE
  5967. + * \notefnerr
  5968. + *
  5969. + * \sa ::cuMemAllocHost, ::cuMemHostAlloc
  5970. + */
  5971. +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
  5972. +
  5973. +#if __CUDA_API_VERSION >= 4010
  5974. +
  5975. +/**
  5976. + * \brief Returns a handle to a compute device
  5977. + *
  5978. + * Returns in \p *dev a device handle given a PCI bus ID string.
  5979. + *
  5980. + * \param dev - Returned device handle
  5981. + *
  5982. + * \param pciBusId - String in one of the following forms:
  5983. + * [domain]:[bus]:[device].[function]
  5984. + * [domain]:[bus]:[device]
  5985. + * [bus]:[device].[function]
  5986. + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
  5987. + *
  5988. + * \return
  5989. + * ::CUDA_SUCCESS,
  5990. + * ::CUDA_ERROR_DEINITIALIZED,
  5991. + * ::CUDA_ERROR_NOT_INITIALIZED,
  5992. + * ::CUDA_ERROR_INVALID_VALUE,
  5993. + * ::CUDA_ERROR_INVALID_DEVICE
  5994. + * \notefnerr
  5995. + *
  5996. + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetPCIBusId
  5997. + */
  5998. +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
  5999. +
  6000. +/**
  6001. + * \brief Returns a PCI Bus Id string for the device
  6002. + *
  6003. + * Returns an ASCII string identifying the device \p dev in the NULL-terminated
  6004. + * string pointed to by \p pciBusId. \p len specifies the maximum length of the
  6005. + * string that may be returned.
  6006. + *
  6007. + * \param pciBusId - Returned identifier string for the device in the following format
  6008. + * [domain]:[bus]:[device].[function]
  6009. + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
  6010. + * pciBusId should be large enough to store 13 characters including the NULL-terminator.
  6011. + *
  6012. + * \param len - Maximum length of string to store in \p pciBusId
  6013. + *
  6014. + * \param dev - Device to get identifier string for
  6015. + *
  6016. + * \return
  6017. + * ::CUDA_SUCCESS,
  6018. + * ::CUDA_ERROR_DEINITIALIZED,
  6019. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6020. + * ::CUDA_ERROR_INVALID_VALUE,
  6021. + * ::CUDA_ERROR_INVALID_DEVICE
  6022. + * \notefnerr
  6023. + *
  6024. + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetByPCIBusId
  6025. + */
  6026. +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
  6027. +
  6028. +/**
  6029. + * \brief Gets an interprocess handle for a previously allocated event
  6030. + *
  6031. + * Takes as input a previously allocated event. This event must have been
  6032. + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
  6033. + * flags set. This opaque handle may be copied into other processes and
  6034. + * opened with ::cuIpcOpenEventHandle to allow efficient hardware
  6035. + * synchronization between GPU work in different processes.
  6036. + *
  6037. + * After the event has been opened in the importing process,
  6038. + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
  6039. + * ::cuEventQuery may be used in either process. Performing operations
  6040. + * on the imported event after the exported event has been freed
  6041. + * with ::cuEventDestroy will result in undefined behavior.
  6042. + *
  6043. + * IPC functionality is restricted to devices with support for unified
  6044. + * addressing on Linux operating systems.
  6045. + *
  6046. + * \param pHandle - Pointer to a user allocated CUipcEventHandle
  6047. + * in which to return the opaque event handle
  6048. + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and
  6049. + * ::CU_EVENT_DISABLE_TIMING flags.
  6050. + *
  6051. + * \return
  6052. + * ::CUDA_SUCCESS,
  6053. + * ::CUDA_ERROR_INVALID_HANDLE,
  6054. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  6055. + * ::CUDA_ERROR_MAP_FAILED
  6056. + *
  6057. + * \sa
  6058. + * ::cuEventCreate,
  6059. + * ::cuEventDestroy,
  6060. + * ::cuEventSynchronize,
  6061. + * ::cuEventQuery,
  6062. + * ::cuStreamWaitEvent,
  6063. + * ::cuIpcOpenEventHandle,
  6064. + * ::cuIpcGetMemHandle,
  6065. + * ::cuIpcOpenMemHandle,
  6066. + * ::cuIpcCloseMemHandle
  6067. + */
  6068. +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
  6069. +
  6070. +/**
  6071. + * \brief Opens an interprocess event handle for use in the current process
  6072. + *
  6073. + * Opens an interprocess event handle exported from another process with
  6074. + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
  6075. + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
  6076. + * This event must be freed with ::cuEventDestroy.
  6077. + *
  6078. + * Performing operations on the imported event after the exported event has
  6079. + * been freed with ::cuEventDestroy will result in undefined behavior.
  6080. + *
  6081. + * IPC functionality is restricted to devices with support for unified
  6082. + * addressing on Linux operating systems.
  6083. + *
  6084. + * \param phEvent - Returns the imported event
  6085. + * \param handle - Interprocess handle to open
  6086. + *
  6087. + * \returns
  6088. + * ::CUDA_SUCCESS,
  6089. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6090. + * ::CUDA_ERROR_MAP_FAILED,
  6091. + * ::CUDA_ERROR_INVALID_HANDLE
  6092. + *
  6093. + * \sa
  6094. + * ::cuEventCreate,
  6095. + * ::cuEventDestroy,
  6096. + * ::cuEventSynchronize,
  6097. + * ::cuEventQuery,
  6098. + * ::cuStreamWaitEvent,
  6099. + * ::cuIpcGetEventHandle,
  6100. + * ::cuIpcGetMemHandle,
  6101. + * ::cuIpcOpenMemHandle,
  6102. + * ::cuIpcCloseMemHandle
  6103. + */
  6104. +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
  6105. +
  6106. +/**
  6107. + * \brief Gets an interprocess memory handle for an existing device memory
  6108. + * allocation
  6109. + *
  6110. + * Takes a pointer to the base of an existing device memory allocation created
  6111. + * with ::cuMemAlloc and exports it for use in another process. This is a
  6112. + * lightweight operation and may be called multiple times on an allocation
  6113. + * without adverse effects.
  6114. + *
  6115. + * If a region of memory is freed with ::cuMemFree and a subsequent call
  6116. + * to ::cuMemAlloc returns memory with the same device address,
  6117. + * ::cuIpcGetMemHandle will return a unique handle for the
  6118. + * new memory.
  6119. + *
  6120. + * IPC functionality is restricted to devices with support for unified
  6121. + * addressing on Linux operating systems.
  6122. + *
  6123. + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
  6124. + * the handle in.
  6125. + * \param dptr - Base pointer to previously allocated device memory
  6126. + *
  6127. + * \returns
  6128. + * ::CUDA_SUCCESS,
  6129. + * ::CUDA_ERROR_INVALID_HANDLE,
  6130. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  6131. + * ::CUDA_ERROR_MAP_FAILED
  6132. + *
  6133. + * \sa
  6134. + * ::cuMemAlloc,
  6135. + * ::cuMemFree,
  6136. + * ::cuIpcGetEventHandle,
  6137. + * ::cuIpcOpenEventHandle,
  6138. + * ::cuIpcOpenMemHandle,
  6139. + * ::cuIpcCloseMemHandle
  6140. + */
  6141. +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
  6142. +
  6143. +/**
  6144. + * \brief Opens an interprocess memory handle exported from another process
  6145. + * and returns a device pointer usable in the local process.
  6146. + *
  6147. + * Maps memory exported from another process with ::cuIpcGetMemHandle into
  6148. + * the current device address space. For contexts on different devices
  6149. + * ::cuIpcOpenMemHandle can attempt to enable peer access between the
  6150. + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
  6151. + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
  6152. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
  6153. + *
  6154. + * Contexts that may open ::CUipcMemHandles are restricted in the following way.
  6155. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
  6156. + * by one ::CUcontext per ::CUdevice per other process.
  6157. + *
  6158. + * Memory returned from ::cuIpcOpenMemHandle must be freed with
  6159. + * ::cuIpcCloseMemHandle.
  6160. + *
  6161. + * Calling ::cuMemFree on an exported memory region before calling
  6162. + * ::cuIpcCloseMemHandle in the importing context will result in undefined
  6163. + * behavior.
  6164. + *
  6165. + * IPC functionality is restricted to devices with support for unified
  6166. + * addressing on Linux operating systems.
  6167. + *
  6168. + * \param pdptr - Returned device pointer
  6169. + * \param handle - ::CUipcMemHandle to open
  6170. + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
  6171. + *
  6172. + * \returns
  6173. + * ::CUDA_SUCCESS,
  6174. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6175. + * ::CUDA_ERROR_MAP_FAILED,
  6176. + * ::CUDA_ERROR_INVALID_HANDLE,
  6177. + * ::CUDA_ERROR_TOO_MANY_PEERS
  6178. + *
  6179. + * \sa
  6180. + * ::cuMemAlloc,
  6181. + * ::cuMemFree,
  6182. + * ::cuIpcGetEventHandle,
  6183. + * ::cuIpcOpenEventHandle,
  6184. + * ::cuIpcGetMemHandle,
  6185. + * ::cuIpcCloseMemHandle,
  6186. + * ::cuCtxEnablePeerAccess,
  6187. + * ::cuDeviceCanAccessPeer,
  6188. + */
  6189. +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
  6190. +
  6191. +/**
  6192. + * \brief Close memory mapped with ::cuIpcOpenMemHandle
  6193. + *
  6194. + * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation
  6195. + * in the exporting process as well as imported mappings in other processes
  6196. + * will be unaffected.
  6197. + *
  6198. + * Any resources used to enable peer access will be freed if this is the
  6199. + * last mapping using them.
  6200. + *
  6201. + * IPC functionality is restricted to devices with support for unified
  6202. + * addressing on Linux operating systems.
  6203. + *
  6204. + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
  6205. + *
  6206. + * \returns
  6207. + * ::CUDA_SUCCESS,
  6208. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6209. + * ::CUDA_ERROR_MAP_FAILED,
  6210. + * ::CUDA_ERROR_INVALID_HANDLE
  6211. + *
  6212. + * \sa
  6213. + * ::cuMemAlloc,
  6214. + * ::cuMemFree,
  6215. + * ::cuIpcGetEventHandle,
  6216. + * ::cuIpcOpenEventHandle,
  6217. + * ::cuIpcGetMemHandle,
  6218. + * ::cuIpcOpenMemHandle,
  6219. + */
  6220. +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
  6221. +
  6222. +#endif /* __CUDA_API_VERSION >= 4010 */
  6223. +
  6224. +#if __CUDA_API_VERSION >= 4000
  6225. +/**
  6226. + * \brief Registers an existing host memory range for use by CUDA
  6227. + *
  6228. + * Page-locks the memory range specified by \p p and \p bytesize and maps it
  6229. + * for the device(s) as specified by \p Flags. This memory range also is added
  6230. + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
  6231. + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
  6232. + * directly by the device, it can be read or written with much higher bandwidth
  6233. + * than pageable memory that has not been registered. Page-locking excessive
  6234. + * amounts of memory may degrade system performance, since it reduces the amount
  6235. + * of memory available to the system for paging. As a result, this function is
  6236. + * best used sparingly to register staging areas for data exchange between
  6237. + * host and device.
  6238. + *
  6239. + * This function has limited support on Mac OS X. OS 10.7 or higher is required.
  6240. + *
  6241. + * The \p Flags parameter enables different options to be specified that
  6242. + * affect the allocation, as follows.
  6243. + *
  6244. + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
  6245. + * considered as pinned memory by all CUDA contexts, not just the one that
  6246. + * performed the allocation.
  6247. + *
  6248. + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
  6249. + * space. The device pointer to the memory may be obtained by calling
  6250. + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs
  6251. + * with compute capability greater than or equal to 1.1.
  6252. + *
  6253. + * All of these flags are orthogonal to one another: a developer may page-lock
  6254. + * memory that is portable or mapped with no restrictions.
  6255. + *
  6256. + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
  6257. + * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect.
  6258. + *
  6259. + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
  6260. + * devices that do not support mapped pinned memory. The failure is deferred
  6261. + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
  6262. + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
  6263. + *
  6264. + * The memory page-locked by this function must be unregistered with
  6265. + * ::cuMemHostUnregister().
  6266. + *
  6267. + * \param p - Host pointer to memory to page-lock
  6268. + * \param bytesize - Size in bytes of the address range to page-lock
  6269. + * \param Flags - Flags for allocation request
  6270. + *
  6271. + * \return
  6272. + * ::CUDA_SUCCESS,
  6273. + * ::CUDA_ERROR_DEINITIALIZED,
  6274. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6275. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6276. + * ::CUDA_ERROR_INVALID_VALUE,
  6277. + * ::CUDA_ERROR_OUT_OF_MEMORY
  6278. + * \notefnerr
  6279. + *
  6280. + * \sa ::cuMemHostUnregister, ::cuMemHostGetFlags, ::cuMemHostGetDevicePointer
  6281. + */
  6282. +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
  6283. +
  6284. +/**
  6285. + * \brief Unregisters a memory range that was registered with ::cuMemHostRegister().
  6286. + *
  6287. + * Unmaps the memory range whose base address is specified by \p p, and makes
  6288. + * it pageable again.
  6289. + *
  6290. + * The base address must be the same one specified to ::cuMemHostRegister().
  6291. + *
  6292. + * \param p - Host pointer to memory to unregister
  6293. + *
  6294. + * \return
  6295. + * ::CUDA_SUCCESS,
  6296. + * ::CUDA_ERROR_DEINITIALIZED,
  6297. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6298. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6299. + * ::CUDA_ERROR_INVALID_VALUE,
  6300. + * ::CUDA_ERROR_OUT_OF_MEMORY
  6301. + * \notefnerr
  6302. + *
  6303. + * \sa ::cuMemHostRegister
  6304. + */
  6305. +CUresult CUDAAPI cuMemHostUnregister(void *p);
  6306. +
  6307. +/**
  6308. + * \brief Copies memory
  6309. + *
  6310. + * Copies data between two pointers.
  6311. + * \p dst and \p src are base pointers of the destination and source, respectively.
  6312. + * \p ByteCount specifies the number of bytes to copy.
  6313. + * Note that this function infers the type of the transfer (host to host, host to
  6314. + * device, device to device, or device to host) from the pointer values. This
  6315. + * function is only allowed in contexts which support unified addressing.
  6316. + * Note that this function is synchronous.
  6317. + *
  6318. + * \param dst - Destination unified virtual address space pointer
  6319. + * \param src - Source unified virtual address space pointer
  6320. + * \param ByteCount - Size of memory copy in bytes
  6321. + *
  6322. + * \return
  6323. + * ::CUDA_SUCCESS,
  6324. + * ::CUDA_ERROR_DEINITIALIZED,
  6325. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6326. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6327. + * ::CUDA_ERROR_INVALID_VALUE
  6328. + * \notefnerr
  6329. + *
  6330. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6331. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6332. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6333. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6334. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
  6335. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6336. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6337. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6338. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6339. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6340. + */
  6341. +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
  6342. +
  6343. +/**
  6344. + * \brief Copies device memory between two contexts
  6345. + *
  6346. + * Copies from device memory in one context to device memory in another
  6347. + * context. \p dstDevice is the base device pointer of the destination memory
  6348. + * and \p dstContext is the destination context. \p srcDevice is the base
  6349. + * device pointer of the source memory and \p srcContext is the source context.
  6350. + * \p ByteCount specifies the number of bytes to copy.
  6351. + *
  6352. + * Note that this function is asynchronous with respect to the host, but
  6353. + * serialized with respect to all pending and future asynchronous work in the
  6354. + * current context, \p srcContext, and \p dstContext (use ::cuMemcpyPeerAsync
  6355. + * to avoid this synchronization).
  6356. + *
  6357. + * \param dstDevice - Destination device pointer
  6358. + * \param dstContext - Destination context
  6359. + * \param srcDevice - Source device pointer
  6360. + * \param srcContext - Source context
  6361. + * \param ByteCount - Size of memory copy in bytes
  6362. + *
  6363. + * \return
  6364. + * ::CUDA_SUCCESS,
  6365. + * ::CUDA_ERROR_DEINITIALIZED,
  6366. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6367. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6368. + * ::CUDA_ERROR_INVALID_VALUE
  6369. + * \notefnerr
  6370. + *
  6371. + * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
  6372. + * ::cuMemcpy3DPeerAsync
  6373. + */
  6374. +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
  6375. +
  6376. +#endif /* __CUDA_API_VERSION >= 4000 */
  6377. +
  6378. +#if __CUDA_API_VERSION >= 3020
  6379. +/**
  6380. + * \brief Copies memory from Host to Device
  6381. + *
  6382. + * Copies from host memory to device memory. \p dstDevice and \p srcHost are
  6383. + * the base addresses of the destination and source, respectively. \p ByteCount
  6384. + * specifies the number of bytes to copy. Note that this function is
  6385. + * synchronous.
  6386. + *
  6387. + * \param dstDevice - Destination device pointer
  6388. + * \param srcHost - Source host pointer
  6389. + * \param ByteCount - Size of memory copy in bytes
  6390. + *
  6391. + * \return
  6392. + * ::CUDA_SUCCESS,
  6393. + * ::CUDA_ERROR_DEINITIALIZED,
  6394. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6395. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6396. + * ::CUDA_ERROR_INVALID_VALUE
  6397. + * \notefnerr
  6398. + *
  6399. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6400. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6401. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6402. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6403. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6404. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6405. + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6406. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6407. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6408. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6409. + */
  6410. +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
  6411. +
  6412. +/**
  6413. + * \brief Copies memory from Device to Host
  6414. + *
  6415. + * Copies from device to host memory. \p dstHost and \p srcDevice specify the
  6416. + * base pointers of the destination and source, respectively. \p ByteCount
  6417. + * specifies the number of bytes to copy. Note that this function is
  6418. + * synchronous.
  6419. + *
  6420. + * \param dstHost - Destination host pointer
  6421. + * \param srcDevice - Source device pointer
  6422. + * \param ByteCount - Size of memory copy in bytes
  6423. + *
  6424. + * \return
  6425. + * ::CUDA_SUCCESS,
  6426. + * ::CUDA_ERROR_DEINITIALIZED,
  6427. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6428. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6429. + * ::CUDA_ERROR_INVALID_VALUE
  6430. + * \notefnerr
  6431. + *
  6432. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6433. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6434. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6435. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6436. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6437. + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6438. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6439. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6440. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6441. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6442. + */
  6443. +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
  6444. +
  6445. +/**
  6446. + * \brief Copies memory from Device to Device
  6447. + *
  6448. + * Copies from device memory to device memory. \p dstDevice and \p srcDevice
  6449. + * are the base pointers of the destination and source, respectively.
  6450. + * \p ByteCount specifies the number of bytes to copy. Note that this function
  6451. + * is asynchronous.
  6452. + *
  6453. + * \param dstDevice - Destination device pointer
  6454. + * \param srcDevice - Source device pointer
  6455. + * \param ByteCount - Size of memory copy in bytes
  6456. + *
  6457. + * \return
  6458. + * ::CUDA_SUCCESS,
  6459. + * ::CUDA_ERROR_DEINITIALIZED,
  6460. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6461. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6462. + * ::CUDA_ERROR_INVALID_VALUE
  6463. + * \notefnerr
  6464. + *
  6465. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6466. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6467. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6468. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6469. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
  6470. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6471. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6472. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6473. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6474. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6475. + */
  6476. +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
  6477. +
  6478. +/**
  6479. + * \brief Copies memory from Device to Array
  6480. + *
  6481. + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
  6482. + * specify the CUDA array handle and starting index of the destination data.
  6483. + * \p srcDevice specifies the base pointer of the source. \p ByteCount
  6484. + * specifies the number of bytes to copy.
  6485. + *
  6486. + * \param dstArray - Destination array
  6487. + * \param dstOffset - Offset in bytes of destination array
  6488. + * \param srcDevice - Source device pointer
  6489. + * \param ByteCount - Size of memory copy in bytes
  6490. + *
  6491. + * \return
  6492. + * ::CUDA_SUCCESS,
  6493. + * ::CUDA_ERROR_DEINITIALIZED,
  6494. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6495. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6496. + * ::CUDA_ERROR_INVALID_VALUE
  6497. + * \notefnerr
  6498. + *
  6499. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6500. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6501. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6502. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6503. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6504. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6505. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6506. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6507. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6508. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6509. + */
  6510. +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
  6511. +
  6512. +/**
  6513. + * \brief Copies memory from Array to Device
  6514. + *
  6515. + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
  6516. + * base pointer of the destination and must be naturally aligned with the CUDA
  6517. + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
  6518. + * and the offset in bytes into the array where the copy is to begin.
  6519. + * \p ByteCount specifies the number of bytes to copy and must be evenly
  6520. + * divisible by the array element size.
  6521. + *
  6522. + * \param dstDevice - Destination device pointer
  6523. + * \param srcArray - Source array
  6524. + * \param srcOffset - Offset in bytes of source array
  6525. + * \param ByteCount - Size of memory copy in bytes
  6526. + *
  6527. + * \return
  6528. + * ::CUDA_SUCCESS,
  6529. + * ::CUDA_ERROR_DEINITIALIZED,
  6530. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6531. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6532. + * ::CUDA_ERROR_INVALID_VALUE
  6533. + * \notefnerr
  6534. + *
  6535. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6536. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6537. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6538. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
  6539. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6540. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6541. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6542. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6543. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6544. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6545. + */
  6546. +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
  6547. +
  6548. +/**
  6549. + * \brief Copies memory from Host to Array
  6550. + *
  6551. + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
  6552. + * specify the CUDA array handle and starting offset in bytes of the destination
  6553. + * data. \p srcHost specifies the base address of the source. \p ByteCount specifies
  6554. + * the number of bytes to copy.
  6555. + *
  6556. + * \param dstArray - Destination array
  6557. + * \param dstOffset - Offset in bytes of destination array
  6558. + * \param srcHost - Source host pointer
  6559. + * \param ByteCount - Size of memory copy in bytes
  6560. + *
  6561. + * \return
  6562. + * ::CUDA_SUCCESS,
  6563. + * ::CUDA_ERROR_DEINITIALIZED,
  6564. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6565. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6566. + * ::CUDA_ERROR_INVALID_VALUE
  6567. + * \notefnerr
  6568. + *
  6569. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6570. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6571. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6572. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6573. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6574. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
  6575. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6576. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6577. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6578. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6579. + */
  6580. +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
  6581. +
  6582. +/**
  6583. + * \brief Copies memory from Array to Host
  6584. + *
  6585. + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
  6586. + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
  6587. + * array handle and starting offset in bytes of the source data.
  6588. + * \p ByteCount specifies the number of bytes to copy.
  6589. + *
  6590. + * \param dstHost - Destination host pointer
  6591. + * \param srcArray - Source array
  6592. + * \param srcOffset - Offset in bytes of source array
  6593. + * \param ByteCount - Size of memory copy in bytes
  6594. + *
  6595. + * \return
  6596. + * ::CUDA_SUCCESS,
  6597. + * ::CUDA_ERROR_DEINITIALIZED,
  6598. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6599. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6600. + * ::CUDA_ERROR_INVALID_VALUE
  6601. + * \notefnerr
  6602. + *
  6603. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6604. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6605. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6606. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6607. + * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6608. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6609. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6610. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6611. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6612. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6613. + */
  6614. +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
  6615. +
  6616. +/**
  6617. + * \brief Copies memory from Array to Array
  6618. + *
  6619. + * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
  6620. + * specify the handles of the destination and source CUDA arrays for the copy,
  6621. + * respectively. \p dstOffset and \p srcOffset specify the destination and
  6622. + * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
  6623. + * bytes to be copied. The elements in the CUDA arrays need not have
  6624. + * the same format, but they must be the same size, and \p ByteCount must be
  6625. + * evenly divisible by that size.
  6626. + *
  6627. + * \param dstArray - Destination array
  6628. + * \param dstOffset - Offset in bytes of destination array
  6629. + * \param srcArray - Source array
  6630. + * \param srcOffset - Offset in bytes of source array
  6631. + * \param ByteCount - Size of memory copy in bytes
  6632. + *
  6633. + * \return
  6634. + * ::CUDA_SUCCESS,
  6635. + * ::CUDA_ERROR_DEINITIALIZED,
  6636. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6637. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6638. + * ::CUDA_ERROR_INVALID_VALUE
  6639. + * \notefnerr
  6640. + *
  6641. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6642. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6643. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6644. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
  6645. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6646. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6647. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6648. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6649. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6650. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6651. + */
  6652. +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
  6653. +
  6654. +/**
  6655. + * \brief Copies memory for 2D arrays
  6656. + *
  6657. + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
  6658. + * The ::CUDA_MEMCPY2D structure is defined as:
  6659. + *
  6660. + * \code
  6661. + typedef struct CUDA_MEMCPY2D_st {
  6662. + unsigned int srcXInBytes, srcY;
  6663. + CUmemorytype srcMemoryType;
  6664. + const void *srcHost;
  6665. + CUdeviceptr srcDevice;
  6666. + CUarray srcArray;
  6667. + unsigned int srcPitch;
  6668. +
  6669. + unsigned int dstXInBytes, dstY;
  6670. + CUmemorytype dstMemoryType;
  6671. + void *dstHost;
  6672. + CUdeviceptr dstDevice;
  6673. + CUarray dstArray;
  6674. + unsigned int dstPitch;
  6675. +
  6676. + unsigned int WidthInBytes;
  6677. + unsigned int Height;
  6678. + } CUDA_MEMCPY2D;
  6679. + * \endcode
  6680. + * where:
  6681. + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
  6682. + * source and destination, respectively; ::CUmemorytype_enum is defined as:
  6683. + *
  6684. + * \code
  6685. + typedef enum CUmemorytype_enum {
  6686. + CU_MEMORYTYPE_HOST = 0x01,
  6687. + CU_MEMORYTYPE_DEVICE = 0x02,
  6688. + CU_MEMORYTYPE_ARRAY = 0x03,
  6689. + CU_MEMORYTYPE_UNIFIED = 0x04
  6690. + } CUmemorytype;
  6691. + * \endcode
  6692. + *
  6693. + * \par
  6694. + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
  6695. + * specify the (unified virtual address space) base address of the source data
  6696. + * and the bytes per row to apply. ::srcArray is ignored.
  6697. + * This value may be used only if unified addressing is supported in the calling
  6698. + * context.
  6699. + *
  6700. + * \par
  6701. + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
  6702. + * specify the (host) base address of the source data and the bytes per row to
  6703. + * apply. ::srcArray is ignored.
  6704. + *
  6705. + * \par
  6706. + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
  6707. + * specify the (device) base address of the source data and the bytes per row
  6708. + * to apply. ::srcArray is ignored.
  6709. + *
  6710. + * \par
  6711. + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
  6712. + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
  6713. + * ignored.
  6714. + *
  6715. + * \par
  6716. + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
  6717. + * specify the (host) base address of the destination data and the bytes per
  6718. + * row to apply. ::dstArray is ignored.
  6719. + *
  6720. + * \par
  6721. + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
  6722. + * specify the (unified virtual address space) base address of the destination
  6723. + * data and the bytes per row to apply. ::dstArray is ignored.
  6724. + * This value may be used only if unified addressing is supported in the calling
  6725. + * context.
  6726. + *
  6727. + * \par
  6728. + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
  6729. + * specify the (device) base address of the destination data and the bytes per
  6730. + * row to apply. ::dstArray is ignored.
  6731. + *
  6732. + * \par
  6733. + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
  6734. + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
  6735. + * ignored.
  6736. + *
  6737. + * - ::srcXInBytes and ::srcY specify the base address of the source data for
  6738. + * the copy.
  6739. + *
  6740. + * \par
  6741. + * For host pointers, the starting address is
  6742. + * \code
  6743. + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
  6744. + * \endcode
  6745. + *
  6746. + * \par
  6747. + * For device pointers, the starting address is
  6748. + * \code
  6749. + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
  6750. + * \endcode
  6751. + *
  6752. + * \par
  6753. + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
  6754. + * element size.
  6755. + *
  6756. + * - ::dstXInBytes and ::dstY specify the base address of the destination data
  6757. + * for the copy.
  6758. + *
  6759. + * \par
  6760. + * For host pointers, the starting address is
  6761. + * \code
  6762. + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
  6763. + * \endcode
  6764. + *
  6765. + * \par
  6766. + * For device pointers, the starting address is
  6767. + * \code
  6768. + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
  6769. + * \endcode
  6770. + *
  6771. + * \par
  6772. + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
  6773. + * element size.
  6774. + *
  6775. + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
  6776. + * the 2D copy being performed.
  6777. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  6778. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  6779. + * ::WidthInBytes + ::dstXInBytes.
  6780. + *
  6781. + * \par
  6782. + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
  6783. + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
  6784. + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
  6785. + * (device to device, CUDA array to device, CUDA array to CUDA array),
  6786. + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
  6787. + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
  6788. + * significantly slower in the cases where ::cuMemcpy2D() would have returned
  6789. + * an error code.
  6790. + *
  6791. + * \param pCopy - Parameters for the memory copy
  6792. + *
  6793. + * \return
  6794. + * ::CUDA_SUCCESS,
  6795. + * ::CUDA_ERROR_DEINITIALIZED,
  6796. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6797. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6798. + * ::CUDA_ERROR_INVALID_VALUE
  6799. + * \notefnerr
  6800. + *
  6801. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6802. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6803. + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  6804. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6805. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6806. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6807. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6808. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6809. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6810. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6811. + */
  6812. +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
  6813. +
  6814. +/**
  6815. + * \brief Copies memory for 2D arrays
  6816. + *
  6817. + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
  6818. + * The ::CUDA_MEMCPY2D structure is defined as:
  6819. + *
  6820. + * \code
  6821. + typedef struct CUDA_MEMCPY2D_st {
  6822. + unsigned int srcXInBytes, srcY;
  6823. + CUmemorytype srcMemoryType;
  6824. + const void *srcHost;
  6825. + CUdeviceptr srcDevice;
  6826. + CUarray srcArray;
  6827. + unsigned int srcPitch;
  6828. + unsigned int dstXInBytes, dstY;
  6829. + CUmemorytype dstMemoryType;
  6830. + void *dstHost;
  6831. + CUdeviceptr dstDevice;
  6832. + CUarray dstArray;
  6833. + unsigned int dstPitch;
  6834. + unsigned int WidthInBytes;
  6835. + unsigned int Height;
  6836. + } CUDA_MEMCPY2D;
  6837. + * \endcode
  6838. + * where:
  6839. + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
  6840. + * source and destination, respectively; ::CUmemorytype_enum is defined as:
  6841. + *
  6842. + * \code
  6843. + typedef enum CUmemorytype_enum {
  6844. + CU_MEMORYTYPE_HOST = 0x01,
  6845. + CU_MEMORYTYPE_DEVICE = 0x02,
  6846. + CU_MEMORYTYPE_ARRAY = 0x03,
  6847. + CU_MEMORYTYPE_UNIFIED = 0x04
  6848. + } CUmemorytype;
  6849. + * \endcode
  6850. + *
  6851. + * \par
  6852. + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
  6853. + * specify the (unified virtual address space) base address of the source data
  6854. + * and the bytes per row to apply. ::srcArray is ignored.
  6855. + * This value may be used only if unified addressing is supported in the calling
  6856. + * context.
  6857. + *
  6858. + * \par
  6859. + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
  6860. + * specify the (host) base address of the source data and the bytes per row to
  6861. + * apply. ::srcArray is ignored.
  6862. + *
  6863. + * \par
  6864. + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
  6865. + * specify the (device) base address of the source data and the bytes per row
  6866. + * to apply. ::srcArray is ignored.
  6867. + *
  6868. + * \par
  6869. + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
  6870. + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
  6871. + * ignored.
  6872. + *
  6873. + * \par
  6874. + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
  6875. + * specify the (unified virtual address space) base address of the destination
  6876. + * data and the bytes per row to apply. ::dstArray is ignored.
  6877. + * This value may be used only if unified addressing is supported in the calling
  6878. + * context.
  6879. + *
  6880. + * \par
  6881. + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
  6882. + * specify the (host) base address of the destination data and the bytes per
  6883. + * row to apply. ::dstArray is ignored.
  6884. + *
  6885. + * \par
  6886. + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
  6887. + * specify the (device) base address of the destination data and the bytes per
  6888. + * row to apply. ::dstArray is ignored.
  6889. + *
  6890. + * \par
  6891. + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
  6892. + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
  6893. + * ignored.
  6894. + *
  6895. + * - ::srcXInBytes and ::srcY specify the base address of the source data for
  6896. + * the copy.
  6897. + *
  6898. + * \par
  6899. + * For host pointers, the starting address is
  6900. + * \code
  6901. + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
  6902. + * \endcode
  6903. + *
  6904. + * \par
  6905. + * For device pointers, the starting address is
  6906. + * \code
  6907. + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
  6908. + * \endcode
  6909. + *
  6910. + * \par
  6911. + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
  6912. + * element size.
  6913. + *
  6914. + * - ::dstXInBytes and ::dstY specify the base address of the destination data
  6915. + * for the copy.
  6916. + *
  6917. + * \par
  6918. + * For host pointers, the starting address is
  6919. + * \code
  6920. + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
  6921. + * \endcode
  6922. + *
  6923. + * \par
  6924. + * For device pointers, the starting address is
  6925. + * \code
  6926. + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
  6927. + * \endcode
  6928. + *
  6929. + * \par
  6930. + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
  6931. + * element size.
  6932. + *
  6933. + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
  6934. + * the 2D copy being performed.
  6935. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  6936. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  6937. + * ::WidthInBytes + ::dstXInBytes.
  6938. + *
  6939. + * \par
  6940. + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
  6941. + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
  6942. + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
  6943. + * (device to device, CUDA array to device, CUDA array to CUDA array),
  6944. + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
  6945. + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
  6946. + * significantly slower in the cases where ::cuMemcpy2D() would have returned
  6947. + * an error code.
  6948. + *
  6949. + * \param pCopy - Parameters for the memory copy
  6950. + *
  6951. + * \return
  6952. + * ::CUDA_SUCCESS,
  6953. + * ::CUDA_ERROR_DEINITIALIZED,
  6954. + * ::CUDA_ERROR_NOT_INITIALIZED,
  6955. + * ::CUDA_ERROR_INVALID_CONTEXT,
  6956. + * ::CUDA_ERROR_INVALID_VALUE
  6957. + * \notefnerr
  6958. + *
  6959. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  6960. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  6961. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
  6962. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  6963. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  6964. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  6965. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  6966. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  6967. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  6968. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  6969. + */
  6970. +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
  6971. +
  6972. +/**
  6973. + * \brief Copies memory for 3D arrays
  6974. + *
  6975. + * Perform a 3D memory copy according to the parameters specified in
  6976. + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
  6977. + *
  6978. + * \code
  6979. + typedef struct CUDA_MEMCPY3D_st {
  6980. +
  6981. + unsigned int srcXInBytes, srcY, srcZ;
  6982. + unsigned int srcLOD;
  6983. + CUmemorytype srcMemoryType;
  6984. + const void *srcHost;
  6985. + CUdeviceptr srcDevice;
  6986. + CUarray srcArray;
  6987. + unsigned int srcPitch; // ignored when src is array
  6988. + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
  6989. +
  6990. + unsigned int dstXInBytes, dstY, dstZ;
  6991. + unsigned int dstLOD;
  6992. + CUmemorytype dstMemoryType;
  6993. + void *dstHost;
  6994. + CUdeviceptr dstDevice;
  6995. + CUarray dstArray;
  6996. + unsigned int dstPitch; // ignored when dst is array
  6997. + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
  6998. +
  6999. + unsigned int WidthInBytes;
  7000. + unsigned int Height;
  7001. + unsigned int Depth;
  7002. + } CUDA_MEMCPY3D;
  7003. + * \endcode
  7004. + * where:
  7005. + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
  7006. + * source and destination, respectively; ::CUmemorytype_enum is defined as:
  7007. + *
  7008. + * \code
  7009. + typedef enum CUmemorytype_enum {
  7010. + CU_MEMORYTYPE_HOST = 0x01,
  7011. + CU_MEMORYTYPE_DEVICE = 0x02,
  7012. + CU_MEMORYTYPE_ARRAY = 0x03,
  7013. + CU_MEMORYTYPE_UNIFIED = 0x04
  7014. + } CUmemorytype;
  7015. + * \endcode
  7016. + *
  7017. + * \par
  7018. + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
  7019. + * specify the (unified virtual address space) base address of the source data
  7020. + * and the bytes per row to apply. ::srcArray is ignored.
  7021. + * This value may be used only if unified addressing is supported in the calling
  7022. + * context.
  7023. + *
  7024. + * \par
  7025. + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
  7026. + * ::srcHeight specify the (host) base address of the source data, the bytes
  7027. + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
  7028. + * ignored.
  7029. + *
  7030. + * \par
  7031. + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
  7032. + * ::srcHeight specify the (device) base address of the source data, the bytes
  7033. + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
  7034. + * ignored.
  7035. + *
  7036. + * \par
  7037. + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
  7038. + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
  7039. + * ::srcHeight are ignored.
  7040. + *
  7041. + * \par
  7042. + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
  7043. + * specify the (unified virtual address space) base address of the destination
  7044. + * data and the bytes per row to apply. ::dstArray is ignored.
  7045. + * This value may be used only if unified addressing is supported in the calling
  7046. + * context.
  7047. + *
  7048. + * \par
  7049. + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
  7050. + * ::dstHeight specify the (host) base address of the destination data, the bytes
  7051. + * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
  7052. + *
  7053. + * \par
  7054. + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
  7055. + * ::dstHeight specify the (device) base address of the destination data, the bytes
  7056. + * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
  7057. + *
  7058. + * \par
  7059. + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
  7060. + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
  7061. + * ::dstHeight are ignored.
  7062. + *
  7063. + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
  7064. + * data for the copy.
  7065. + *
  7066. + * \par
  7067. + * For host pointers, the starting address is
  7068. + * \code
  7069. + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
  7070. + * \endcode
  7071. + *
  7072. + * \par
  7073. + * For device pointers, the starting address is
  7074. + * \code
  7075. + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
  7076. + * \endcode
  7077. + *
  7078. + * \par
  7079. + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
  7080. + * element size.
  7081. + *
  7082. + * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
  7083. + * destination data for the copy.
  7084. + *
  7085. + * \par
  7086. + * For host pointers, the starting address is
  7087. + * \code
  7088. + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
  7089. + * \endcode
  7090. + *
  7091. + * \par
  7092. + * For device pointers, the starting address is
  7093. + * \code
  7094. + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
  7095. + * \endcode
  7096. + *
  7097. + * \par
  7098. + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
  7099. + * element size.
  7100. + *
  7101. + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
  7102. + * and depth of the 3D copy being performed.
  7103. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  7104. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  7105. + * ::WidthInBytes + ::dstXInBytes.
  7106. + * - If specified, ::srcHeight must be greater than or equal to ::Height +
  7107. + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
  7108. + *
  7109. + * \par
  7110. + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
  7111. + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
  7112. + *
  7113. + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
  7114. + * set to 0.
  7115. + *
  7116. + * \param pCopy - Parameters for the memory copy
  7117. + *
  7118. + * \return
  7119. + * ::CUDA_SUCCESS,
  7120. + * ::CUDA_ERROR_DEINITIALIZED,
  7121. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7122. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7123. + * ::CUDA_ERROR_INVALID_VALUE
  7124. + * \notefnerr
  7125. + *
  7126. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7127. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7128. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7129. + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7130. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7131. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7132. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7133. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7134. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  7135. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  7136. + */
  7137. +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
  7138. +#endif /* __CUDA_API_VERSION >= 3020 */
  7139. +
  7140. +#if __CUDA_API_VERSION >= 4000
  7141. +/**
  7142. + * \brief Copies memory between contexts
  7143. + *
  7144. + * Perform a 3D memory copy according to the parameters specified in
  7145. + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
  7146. + * for documentation of its parameters.
  7147. + *
  7148. + * Note that this function is synchronous with respect to the host only if
  7149. + * the source or destination memory is of type ::CU_MEMORYTYPE_HOST.
  7150. + * Note also that this copy is serialized with respect to all pending and future
  7151. + * asynchronous work in the current context, the copy's source context,
  7152. + * and the copy's destination context (use ::cuMemcpy3DPeerAsync to avoid
  7153. + * this synchronization).
  7154. + *
  7155. + * \param pCopy - Parameters for the memory copy
  7156. + *
  7157. + * \return
  7158. + * ::CUDA_SUCCESS,
  7159. + * ::CUDA_ERROR_DEINITIALIZED,
  7160. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7161. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7162. + * ::CUDA_ERROR_INVALID_VALUE
  7163. + * \notefnerr
  7164. + *
  7165. + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
  7166. + * ::cuMemcpy3DPeerAsync
  7167. + */
  7168. +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
  7169. +
  7170. +/**
  7171. + * \brief Copies memory asynchronously
  7172. + *
  7173. + * Copies data between two pointers.
  7174. + * \p dst and \p src are base pointers of the destination and source, respectively.
  7175. + * \p ByteCount specifies the number of bytes to copy.
  7176. + * Note that this function infers the type of the transfer (host to host, host to
  7177. + * device, device to device, or device to host) from the pointer values. This
  7178. + * function is only allowed in contexts which support unified addressing.
  7179. + * Note that this function is asynchronous and can optionally be associated to
  7180. + * a stream by passing a non-zero \p hStream argument.
  7181. + *
  7182. + * \param dst - Destination unified virtual address space pointer
  7183. + * \param src - Source unified virtual address space pointer
  7184. + * \param ByteCount - Size of memory copy in bytes
  7185. + * \param hStream - Stream identifier
  7186. + *
  7187. + * \return
  7188. + * ::CUDA_SUCCESS,
  7189. + * ::CUDA_ERROR_DEINITIALIZED,
  7190. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7191. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7192. + * ::CUDA_ERROR_INVALID_VALUE
  7193. + * \notefnerr
  7194. + *
  7195. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7196. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7197. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7198. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7199. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
  7200. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7201. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7202. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7203. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7204. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7205. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7206. + * ::cuMemsetD32, ::cuMemsetD32Async
  7207. + */
  7208. +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
  7209. +
  7210. +/**
  7211. + * \brief Copies device memory between two contexts asynchronously.
  7212. + *
  7213. + * Copies from device memory in one context to device memory in another
  7214. + * context. \p dstDevice is the base device pointer of the destination memory
  7215. + * and \p dstContext is the destination context. \p srcDevice is the base
  7216. + * device pointer of the source memory and \p srcContext is the source context.
  7217. + * \p ByteCount specifies the number of bytes to copy. Note that this function
  7218. + * is asynchronous with respect to the host and all work in other streams in
  7219. + * other devices.
  7220. + *
  7221. + * \param dstDevice - Destination device pointer
  7222. + * \param dstContext - Destination context
  7223. + * \param srcDevice - Source device pointer
  7224. + * \param srcContext - Source context
  7225. + * \param ByteCount - Size of memory copy in bytes
  7226. + * \param hStream - Stream identifier
  7227. + *
  7228. + * \return
  7229. + * ::CUDA_SUCCESS,
  7230. + * ::CUDA_ERROR_DEINITIALIZED,
  7231. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7232. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7233. + * ::CUDA_ERROR_INVALID_VALUE
  7234. + * \notefnerr
  7235. + *
  7236. + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
  7237. + * ::cuMemcpy3DPeerAsync
  7238. + */
  7239. +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
  7240. +#endif /* __CUDA_API_VERSION >= 4000 */
  7241. +
  7242. +#if __CUDA_API_VERSION >= 3020
  7243. +/**
  7244. + * \brief Copies memory from Host to Device
  7245. + *
  7246. + * Copies from host memory to device memory. \p dstDevice and \p srcHost are
  7247. + * the base addresses of the destination and source, respectively. \p ByteCount
  7248. + * specifies the number of bytes to copy.
  7249. + *
  7250. + * ::cuMemcpyHtoDAsync() is asynchronous and can optionally be associated to a
  7251. + * stream by passing a non-zero \p hStream argument. It only works on
  7252. + * page-locked memory and returns an error if a pointer to pageable memory is
  7253. + * passed as input.
  7254. + *
  7255. + * \param dstDevice - Destination device pointer
  7256. + * \param srcHost - Source host pointer
  7257. + * \param ByteCount - Size of memory copy in bytes
  7258. + * \param hStream - Stream identifier
  7259. + *
  7260. + * \return
  7261. + * ::CUDA_SUCCESS,
  7262. + * ::CUDA_ERROR_DEINITIALIZED,
  7263. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7264. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7265. + * ::CUDA_ERROR_INVALID_VALUE
  7266. + * \notefnerr
  7267. + *
  7268. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7269. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7270. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7271. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7272. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7273. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7274. + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
  7275. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7276. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7277. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7278. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7279. + * ::cuMemsetD32, ::cuMemsetD32Async
  7280. + */
  7281. +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
  7282. +
  7283. +/**
  7284. + * \brief Copies memory from Device to Host
  7285. + *
  7286. + * Copies from device to host memory. \p dstHost and \p srcDevice specify the
  7287. + * base pointers of the destination and source, respectively. \p ByteCount
  7288. + * specifies the number of bytes to copy.
  7289. + *
  7290. + * ::cuMemcpyDtoHAsync() is asynchronous and can optionally be associated to a
  7291. + * stream by passing a non-zero \p hStream argument. It only works on
  7292. + * page-locked memory and returns an error if a pointer to pageable memory is
  7293. + * passed as input.
  7294. + *
  7295. + * \param dstHost - Destination host pointer
  7296. + * \param srcDevice - Source device pointer
  7297. + * \param ByteCount - Size of memory copy in bytes
  7298. + * \param hStream - Stream identifier
  7299. + *
  7300. + * \return
  7301. + * ::CUDA_SUCCESS,
  7302. + * ::CUDA_ERROR_DEINITIALIZED,
  7303. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7304. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7305. + * ::CUDA_ERROR_INVALID_VALUE
  7306. + * \notefnerr
  7307. + *
  7308. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7309. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7310. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7311. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7312. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7313. + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7314. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7315. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7316. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7317. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7318. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7319. + * ::cuMemsetD32, ::cuMemsetD32Async
  7320. + */
  7321. +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
  7322. +
  7323. +/**
  7324. + * \brief Copies memory from Device to Device
  7325. + *
  7326. + * Copies from device memory to device memory. \p dstDevice and \p srcDevice
  7327. + * are the base pointers of the destination and source, respectively.
  7328. + * \p ByteCount specifies the number of bytes to copy. Note that this function
  7329. + * is asynchronous and can optionally be associated to a stream by passing a
  7330. + * non-zero \p hStream argument.
  7331. + *
  7332. + * \param dstDevice - Destination device pointer
  7333. + * \param srcDevice - Source device pointer
  7334. + * \param ByteCount - Size of memory copy in bytes
  7335. + * \param hStream - Stream identifier
  7336. + *
  7337. + * \return
  7338. + * ::CUDA_SUCCESS,
  7339. + * ::CUDA_ERROR_DEINITIALIZED,
  7340. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7341. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7342. + * ::CUDA_ERROR_INVALID_VALUE
  7343. + * \notefnerr
  7344. + *
  7345. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7346. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7347. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7348. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7349. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
  7350. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7351. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7352. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7353. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7354. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7355. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7356. + * ::cuMemsetD32, ::cuMemsetD32Async
  7357. + */
  7358. +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
  7359. +
  7360. +/**
  7361. + * \brief Copies memory from Host to Array
  7362. + *
  7363. + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
  7364. + * specify the CUDA array handle and starting offset in bytes of the
  7365. + * destination data. \p srcHost specifies the base address of the source.
  7366. + * \p ByteCount specifies the number of bytes to copy.
  7367. + *
  7368. + * ::cuMemcpyHtoAAsync() is asynchronous and can optionally be associated to a
  7369. + * stream by passing a non-zero \p hStream argument. It only works on
  7370. + * page-locked memory and returns an error if a pointer to pageable memory is
  7371. + * passed as input.
  7372. + *
  7373. + * \param dstArray - Destination array
  7374. + * \param dstOffset - Offset in bytes of destination array
  7375. + * \param srcHost - Source host pointer
  7376. + * \param ByteCount - Size of memory copy in bytes
  7377. + * \param hStream - Stream identifier
  7378. + *
  7379. + * \return
  7380. + * ::CUDA_SUCCESS,
  7381. + * ::CUDA_ERROR_DEINITIALIZED,
  7382. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7383. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7384. + * ::CUDA_ERROR_INVALID_VALUE
  7385. + * \notefnerr
  7386. + *
  7387. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7388. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7389. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7390. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7391. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7392. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
  7393. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7394. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7395. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7396. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7397. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7398. + * ::cuMemsetD32, ::cuMemsetD32Async
  7399. + */
  7400. +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
  7401. +
  7402. +/**
  7403. + * \brief Copies memory from Array to Host
  7404. + *
  7405. + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
  7406. + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
  7407. + * array handle and starting offset in bytes of the source data.
  7408. + * \p ByteCount specifies the number of bytes to copy.
  7409. + *
  7410. + * ::cuMemcpyAtoHAsync() is asynchronous and can optionally be associated to a
  7411. + * stream by passing a non-zero \p hStream argument. It only works on
  7412. + * page-locked host memory and returns an error if a pointer to pageable
  7413. + * memory is passed as input.
  7414. + *
  7415. + * \param dstHost - Destination pointer
  7416. + * \param srcArray - Source array
  7417. + * \param srcOffset - Offset in bytes of source array
  7418. + * \param ByteCount - Size of memory copy in bytes
  7419. + * \param hStream - Stream identifier
  7420. + *
  7421. + * \return
  7422. + * ::CUDA_SUCCESS,
  7423. + * ::CUDA_ERROR_DEINITIALIZED,
  7424. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7425. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7426. + * ::CUDA_ERROR_INVALID_VALUE
  7427. + * \notefnerr
  7428. + *
  7429. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7430. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7431. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7432. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7433. + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7434. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7435. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7436. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7437. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7438. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7439. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7440. + * ::cuMemsetD32, ::cuMemsetD32Async
  7441. + */
  7442. +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
  7443. +
  7444. +/**
  7445. + * \brief Copies memory for 2D arrays
  7446. + *
  7447. + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
  7448. + * The ::CUDA_MEMCPY2D structure is defined as:
  7449. + *
  7450. + * \code
  7451. + typedef struct CUDA_MEMCPY2D_st {
  7452. + unsigned int srcXInBytes, srcY;
  7453. + CUmemorytype srcMemoryType;
  7454. + const void *srcHost;
  7455. + CUdeviceptr srcDevice;
  7456. + CUarray srcArray;
  7457. + unsigned int srcPitch;
  7458. + unsigned int dstXInBytes, dstY;
  7459. + CUmemorytype dstMemoryType;
  7460. + void *dstHost;
  7461. + CUdeviceptr dstDevice;
  7462. + CUarray dstArray;
  7463. + unsigned int dstPitch;
  7464. + unsigned int WidthInBytes;
  7465. + unsigned int Height;
  7466. + } CUDA_MEMCPY2D;
  7467. + * \endcode
  7468. + * where:
  7469. + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
  7470. + * source and destination, respectively; ::CUmemorytype_enum is defined as:
  7471. + *
  7472. + * \code
  7473. + typedef enum CUmemorytype_enum {
  7474. + CU_MEMORYTYPE_HOST = 0x01,
  7475. + CU_MEMORYTYPE_DEVICE = 0x02,
  7476. + CU_MEMORYTYPE_ARRAY = 0x03,
  7477. + CU_MEMORYTYPE_UNIFIED = 0x04
  7478. + } CUmemorytype;
  7479. + * \endcode
  7480. + *
  7481. + * \par
  7482. + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
  7483. + * specify the (host) base address of the source data and the bytes per row to
  7484. + * apply. ::srcArray is ignored.
  7485. + *
  7486. + * \par
  7487. + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
  7488. + * specify the (unified virtual address space) base address of the source data
  7489. + * and the bytes per row to apply. ::srcArray is ignored.
  7490. + * This value may be used only if unified addressing is supported in the calling
  7491. + * context.
  7492. + *
  7493. + * \par
  7494. + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
  7495. + * specify the (device) base address of the source data and the bytes per row
  7496. + * to apply. ::srcArray is ignored.
  7497. + *
  7498. + * \par
  7499. + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
  7500. + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
  7501. + * ignored.
  7502. + *
  7503. + * \par
  7504. + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
  7505. + * specify the (unified virtual address space) base address of the destination data
  7506. + * and the bytes per row to apply. ::dstArray is ignored.
  7507. + * This value may be used only if unified addressing is supported in the calling
  7508. + * context.
  7509. + *
  7510. + * \par
  7511. + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
  7512. + * specify the (host) base address of the destination data and the bytes per
  7513. + * row to apply. ::dstArray is ignored.
  7514. + *
  7515. + * \par
  7516. + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
  7517. + * specify the (device) base address of the destination data and the bytes per
  7518. + * row to apply. ::dstArray is ignored.
  7519. + *
  7520. + * \par
  7521. + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
  7522. + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
  7523. + * ignored.
  7524. + *
  7525. + * - ::srcXInBytes and ::srcY specify the base address of the source data for
  7526. + * the copy.
  7527. + *
  7528. + * \par
  7529. + * For host pointers, the starting address is
  7530. + * \code
  7531. + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
  7532. + * \endcode
  7533. + *
  7534. + * \par
  7535. + * For device pointers, the starting address is
  7536. + * \code
  7537. + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
  7538. + * \endcode
  7539. + *
  7540. + * \par
  7541. + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
  7542. + * element size.
  7543. + *
  7544. + * - ::dstXInBytes and ::dstY specify the base address of the destination data
  7545. + * for the copy.
  7546. + *
  7547. + * \par
  7548. + * For host pointers, the base address is
  7549. + * \code
  7550. + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
  7551. + * \endcode
  7552. + *
  7553. + * \par
  7554. + * For device pointers, the starting address is
  7555. + * \code
  7556. + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
  7557. + * \endcode
  7558. + *
  7559. + * \par
  7560. + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
  7561. + * element size.
  7562. + *
  7563. + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
  7564. + * the 2D copy being performed.
  7565. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  7566. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  7567. + * ::WidthInBytes + ::dstXInBytes.
  7568. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  7569. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  7570. + * ::WidthInBytes + ::dstXInBytes.
  7571. + * - If specified, ::srcHeight must be greater than or equal to ::Height +
  7572. + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
  7573. + *
  7574. + * \par
  7575. + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
  7576. + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
  7577. + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
  7578. + * (device to device, CUDA array to device, CUDA array to CUDA array),
  7579. + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
  7580. + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
  7581. + * significantly slower in the cases where ::cuMemcpy2D() would have returned
  7582. + * an error code.
  7583. + *
  7584. + * ::cuMemcpy2DAsync() is asynchronous and can optionally be associated to a
  7585. + * stream by passing a non-zero \p hStream argument. It only works on
  7586. + * page-locked host memory and returns an error if a pointer to pageable
  7587. + * memory is passed as input.
  7588. + *
  7589. + * \param pCopy - Parameters for the memory copy
  7590. + * \param hStream - Stream identifier
  7591. + *
  7592. + * \return
  7593. + * ::CUDA_SUCCESS,
  7594. + * ::CUDA_ERROR_DEINITIALIZED,
  7595. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7596. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7597. + * ::CUDA_ERROR_INVALID_VALUE
  7598. + * \notefnerr
  7599. + *
  7600. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7601. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7602. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
  7603. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7604. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7605. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7606. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7607. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7608. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7609. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7610. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7611. + * ::cuMemsetD32, ::cuMemsetD32Async
  7612. + */
  7613. +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
  7614. +
  7615. +/**
  7616. + * \brief Copies memory for 3D arrays
  7617. + *
  7618. + * Perform a 3D memory copy according to the parameters specified in
  7619. + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
  7620. + *
  7621. + * \code
  7622. + typedef struct CUDA_MEMCPY3D_st {
  7623. +
  7624. + unsigned int srcXInBytes, srcY, srcZ;
  7625. + unsigned int srcLOD;
  7626. + CUmemorytype srcMemoryType;
  7627. + const void *srcHost;
  7628. + CUdeviceptr srcDevice;
  7629. + CUarray srcArray;
  7630. + unsigned int srcPitch; // ignored when src is array
  7631. + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
  7632. +
  7633. + unsigned int dstXInBytes, dstY, dstZ;
  7634. + unsigned int dstLOD;
  7635. + CUmemorytype dstMemoryType;
  7636. + void *dstHost;
  7637. + CUdeviceptr dstDevice;
  7638. + CUarray dstArray;
  7639. + unsigned int dstPitch; // ignored when dst is array
  7640. + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
  7641. +
  7642. + unsigned int WidthInBytes;
  7643. + unsigned int Height;
  7644. + unsigned int Depth;
  7645. + } CUDA_MEMCPY3D;
  7646. + * \endcode
  7647. + * where:
  7648. + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
  7649. + * source and destination, respectively; ::CUmemorytype_enum is defined as:
  7650. + *
  7651. + * \code
  7652. + typedef enum CUmemorytype_enum {
  7653. + CU_MEMORYTYPE_HOST = 0x01,
  7654. + CU_MEMORYTYPE_DEVICE = 0x02,
  7655. + CU_MEMORYTYPE_ARRAY = 0x03,
  7656. + CU_MEMORYTYPE_UNIFIED = 0x04
  7657. + } CUmemorytype;
  7658. + * \endcode
  7659. + *
  7660. + * \par
  7661. + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
  7662. + * specify the (unified virtual address space) base address of the source data
  7663. + * and the bytes per row to apply. ::srcArray is ignored.
  7664. + * This value may be used only if unified addressing is supported in the calling
  7665. + * context.
  7666. + *
  7667. + * \par
  7668. + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
  7669. + * ::srcHeight specify the (host) base address of the source data, the bytes
  7670. + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
  7671. + * ignored.
  7672. + *
  7673. + * \par
  7674. + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
  7675. + * ::srcHeight specify the (device) base address of the source data, the bytes
  7676. + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
  7677. + * ignored.
  7678. + *
  7679. + * \par
  7680. + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
  7681. + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
  7682. + * ::srcHeight are ignored.
  7683. + *
  7684. + * \par
  7685. + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
  7686. + * specify the (unified virtual address space) base address of the destination data
  7687. + * and the bytes per row to apply. ::dstArray is ignored.
  7688. + * This value may be used only if unified addressing is supported in the calling
  7689. + * context.
  7690. + *
  7691. + * \par
  7692. + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and ::dstHeight
  7693. + * specify the (host) base address of the destination data, the bytes per row,
  7694. + * and the height of each 2D slice of the 3D array. ::dstArray is ignored.
  7695. + *
  7696. + * \par
  7697. + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and ::dstHeight
  7698. + * specify the (device) base address of the destination data, the bytes per
  7699. + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
  7700. + *
  7701. + * \par
  7702. + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
  7703. + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
  7704. + * ::dstHeight are ignored.
  7705. + *
  7706. + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
  7707. + * data for the copy.
  7708. + *
  7709. + * \par
  7710. + * For host pointers, the starting address is
  7711. + * \code
  7712. + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
  7713. + * \endcode
  7714. + *
  7715. + * \par
  7716. + * For device pointers, the starting address is
  7717. + * \code
  7718. + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
  7719. + * \endcode
  7720. + *
  7721. + * \par
  7722. + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
  7723. + * element size.
  7724. + *
  7725. + * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
  7726. + * destination data for the copy.
  7727. + *
  7728. + * \par
  7729. + * For host pointers, the base address is
  7730. + * \code
  7731. + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
  7732. + * \endcode
  7733. + *
  7734. + * \par
  7735. + * For device pointers, the starting address is
  7736. + * \code
  7737. + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
  7738. + * \endcode
  7739. + *
  7740. + * \par
  7741. + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
  7742. + * element size.
  7743. + *
  7744. + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
  7745. + * and depth of the 3D copy being performed.
  7746. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
  7747. + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
  7748. + * ::WidthInBytes + ::dstXInBytes.
  7749. + * - If specified, ::srcHeight must be greater than or equal to ::Height +
  7750. + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
  7751. + *
  7752. + * \par
  7753. + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
  7754. + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
  7755. + *
  7756. + * ::cuMemcpy3DAsync() is asynchronous and can optionally be associated to a
  7757. + * stream by passing a non-zero \p hStream argument. It only works on
  7758. + * page-locked host memory and returns an error if a pointer to pageable
  7759. + * memory is passed as input.
  7760. + *
  7761. + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
  7762. + * set to 0.
  7763. + *
  7764. + * \param pCopy - Parameters for the memory copy
  7765. + * \param hStream - Stream identifier
  7766. + *
  7767. + * \return
  7768. + * ::CUDA_SUCCESS,
  7769. + * ::CUDA_ERROR_DEINITIALIZED,
  7770. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7771. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7772. + * ::CUDA_ERROR_INVALID_VALUE
  7773. + * \notefnerr
  7774. + *
  7775. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7776. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7777. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7778. + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7779. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7780. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7781. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7782. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7783. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7784. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7785. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7786. + * ::cuMemsetD32, ::cuMemsetD32Async
  7787. + */
  7788. +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
  7789. +#endif /* __CUDA_API_VERSION >= 3020 */
  7790. +
  7791. +#if __CUDA_API_VERSION >= 4000
  7792. +/**
  7793. + * \brief Copies memory between contexts asynchronously.
  7794. + *
  7795. + * Perform a 3D memory copy according to the parameters specified in
  7796. + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
  7797. + * for documentation of its parameters.
  7798. + *
  7799. + * \param pCopy - Parameters for the memory copy
  7800. + * \param hStream - Stream identifier
  7801. + *
  7802. + * \return
  7803. + * ::CUDA_SUCCESS,
  7804. + * ::CUDA_ERROR_DEINITIALIZED,
  7805. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7806. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7807. + * ::CUDA_ERROR_INVALID_VALUE
  7808. + * \notefnerr
  7809. + *
  7810. + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
  7811. + * ::cuMemcpy3DPeerAsync
  7812. + */
  7813. +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
  7814. +#endif /* __CUDA_API_VERSION >= 4000 */
  7815. +
  7816. +#if __CUDA_API_VERSION >= 3020
  7817. +/**
  7818. + * \brief Initializes device memory
  7819. + *
  7820. + * Sets the memory range of \p N 8-bit values to the specified value
  7821. + * \p uc.
  7822. + *
  7823. + * Note that this function is asynchronous with respect to the host unless
  7824. + * \p dstDevice refers to pinned host memory.
  7825. + *
  7826. + * \param dstDevice - Destination device pointer
  7827. + * \param uc - Value to set
  7828. + * \param N - Number of elements
  7829. + *
  7830. + * \return
  7831. + * ::CUDA_SUCCESS,
  7832. + * ::CUDA_ERROR_DEINITIALIZED,
  7833. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7834. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7835. + * ::CUDA_ERROR_INVALID_VALUE
  7836. + * \notefnerr
  7837. + *
  7838. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7839. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7840. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7841. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7842. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7843. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7844. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7845. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7846. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7847. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7848. + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7849. + * ::cuMemsetD32, ::cuMemsetD32Async
  7850. + */
  7851. +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
  7852. +
  7853. +/**
  7854. + * \brief Initializes device memory
  7855. + *
  7856. + * Sets the memory range of \p N 16-bit values to the specified value
  7857. + * \p us. The \p dstDevice pointer must be two byte aligned.
  7858. + *
  7859. + * Note that this function is asynchronous with respect to the host unless
  7860. + * \p dstDevice refers to pinned host memory.
  7861. + *
  7862. + * \param dstDevice - Destination device pointer
  7863. + * \param us - Value to set
  7864. + * \param N - Number of elements
  7865. + *
  7866. + * \return
  7867. + * ::CUDA_SUCCESS,
  7868. + * ::CUDA_ERROR_DEINITIALIZED,
  7869. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7870. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7871. + * ::CUDA_ERROR_INVALID_VALUE
  7872. + * \notefnerr
  7873. + *
  7874. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7875. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7876. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7877. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7878. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7879. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7880. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7881. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7882. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7883. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7884. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
  7885. + * ::cuMemsetD32, ::cuMemsetD32Async
  7886. + */
  7887. +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
  7888. +
  7889. +/**
  7890. + * \brief Initializes device memory
  7891. + *
  7892. + * Sets the memory range of \p N 32-bit values to the specified value
  7893. + * \p ui. The \p dstDevice pointer must be four byte aligned.
  7894. + *
  7895. + * Note that this function is asynchronous with respect to the host unless
  7896. + * \p dstDevice refers to pinned host memory.
  7897. + *
  7898. + * \param dstDevice - Destination device pointer
  7899. + * \param ui - Value to set
  7900. + * \param N - Number of elements
  7901. + *
  7902. + * \return
  7903. + * ::CUDA_SUCCESS,
  7904. + * ::CUDA_ERROR_DEINITIALIZED,
  7905. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7906. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7907. + * ::CUDA_ERROR_INVALID_VALUE
  7908. + * \notefnerr
  7909. + *
  7910. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7911. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7912. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7913. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7914. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7915. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7916. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7917. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7918. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  7919. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7920. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7921. + * ::cuMemsetD32Async
  7922. + */
  7923. +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
  7924. +
  7925. +/**
  7926. + * \brief Initializes device memory
  7927. + *
  7928. + * Sets the 2D memory range of \p Width 8-bit values to the specified value
  7929. + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
  7930. + * specifies the number of bytes between each row. This function performs
  7931. + * fastest when the pitch is one that has been passed back by
  7932. + * ::cuMemAllocPitch().
  7933. + *
  7934. + * Note that this function is asynchronous with respect to the host unless
  7935. + * \p dstDevice refers to pinned host memory.
  7936. + *
  7937. + * \param dstDevice - Destination device pointer
  7938. + * \param dstPitch - Pitch of destination device pointer
  7939. + * \param uc - Value to set
  7940. + * \param Width - Width of row
  7941. + * \param Height - Number of rows
  7942. + *
  7943. + * \return
  7944. + * ::CUDA_SUCCESS,
  7945. + * ::CUDA_ERROR_DEINITIALIZED,
  7946. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7947. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7948. + * ::CUDA_ERROR_INVALID_VALUE
  7949. + * \notefnerr
  7950. + *
  7951. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7952. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7953. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7954. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7955. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7956. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7957. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  7958. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  7959. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
  7960. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  7961. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  7962. + * ::cuMemsetD32, ::cuMemsetD32Async
  7963. + */
  7964. +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
  7965. +
  7966. +/**
  7967. + * \brief Initializes device memory
  7968. + *
  7969. + * Sets the 2D memory range of \p Width 16-bit values to the specified value
  7970. + * \p us. \p Height specifies the number of rows to set, and \p dstPitch
  7971. + * specifies the number of bytes between each row. The \p dstDevice pointer
  7972. + * and \p dstPitch offset must be two byte aligned. This function performs
  7973. + * fastest when the pitch is one that has been passed back by
  7974. + * ::cuMemAllocPitch().
  7975. + *
  7976. + * Note that this function is asynchronous with respect to the host unless
  7977. + * \p dstDevice refers to pinned host memory.
  7978. + *
  7979. + * \param dstDevice - Destination device pointer
  7980. + * \param dstPitch - Pitch of destination device pointer
  7981. + * \param us - Value to set
  7982. + * \param Width - Width of row
  7983. + * \param Height - Number of rows
  7984. + *
  7985. + * \return
  7986. + * ::CUDA_SUCCESS,
  7987. + * ::CUDA_ERROR_DEINITIALIZED,
  7988. + * ::CUDA_ERROR_NOT_INITIALIZED,
  7989. + * ::CUDA_ERROR_INVALID_CONTEXT,
  7990. + * ::CUDA_ERROR_INVALID_VALUE
  7991. + * \notefnerr
  7992. + *
  7993. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  7994. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  7995. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  7996. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  7997. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  7998. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  7999. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8000. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8001. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8002. + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8003. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  8004. + * ::cuMemsetD32, ::cuMemsetD32Async
  8005. + */
  8006. +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
  8007. +
  8008. +/**
  8009. + * \brief Initializes device memory
  8010. + *
  8011. + * Sets the 2D memory range of \p Width 32-bit values to the specified value
  8012. + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
  8013. + * specifies the number of bytes between each row. The \p dstDevice pointer
  8014. + * and \p dstPitch offset must be four byte aligned. This function performs
  8015. + * fastest when the pitch is one that has been passed back by
  8016. + * ::cuMemAllocPitch().
  8017. + *
  8018. + * Note that this function is asynchronous with respect to the host unless
  8019. + * \p dstDevice refers to pinned host memory.
  8020. + *
  8021. + * \param dstDevice - Destination device pointer
  8022. + * \param dstPitch - Pitch of destination device pointer
  8023. + * \param ui - Value to set
  8024. + * \param Width - Width of row
  8025. + * \param Height - Number of rows
  8026. + *
  8027. + * \return
  8028. + * ::CUDA_SUCCESS,
  8029. + * ::CUDA_ERROR_DEINITIALIZED,
  8030. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8031. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8032. + * ::CUDA_ERROR_INVALID_VALUE
  8033. + * \notefnerr
  8034. + *
  8035. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8036. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8037. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8038. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8039. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8040. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8041. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8042. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8043. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8044. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
  8045. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  8046. + * ::cuMemsetD32, ::cuMemsetD32Async
  8047. + */
  8048. +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
  8049. +
  8050. +/**
  8051. + * \brief Sets device memory
  8052. + *
  8053. + * Sets the memory range of \p N 8-bit values to the specified value
  8054. + * \p uc.
  8055. + *
  8056. + * ::cuMemsetD8Async() is asynchronous and can optionally be associated with a
  8057. + * stream by passing a non-zero \p hStream argument.
  8058. + *
  8059. + * \param dstDevice - Destination device pointer
  8060. + * \param uc - Value to set
  8061. + * \param N - Number of elements
  8062. + * \param hStream - Stream identifier
  8063. + *
  8064. + * \return
  8065. + * ::CUDA_SUCCESS,
  8066. + * ::CUDA_ERROR_DEINITIALIZED,
  8067. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8068. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8069. + * ::CUDA_ERROR_INVALID_VALUE
  8070. + * \notefnerr
  8071. + *
  8072. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8073. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8074. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8075. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8076. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8077. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8078. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8079. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8080. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8081. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8082. + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
  8083. + * ::cuMemsetD32, ::cuMemsetD32Async
  8084. + */
  8085. +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
  8086. +
  8087. +/**
  8088. + * \brief Sets device memory
  8089. + *
  8090. + * Sets the memory range of \p N 16-bit values to the specified value
  8091. + * \p us. The \p dstDevice pointer must be two byte aligned.
  8092. + *
  8093. + * ::cuMemsetD16Async() is asynchronous and can optionally be associated with a
  8094. + * stream by passing a non-zero \p hStream argument.
  8095. + *
  8096. + * \param dstDevice - Destination device pointer
  8097. + * \param us - Value to set
  8098. + * \param N - Number of elements
  8099. + * \param hStream - Stream identifier
  8100. + *
  8101. + * \return
  8102. + * ::CUDA_SUCCESS,
  8103. + * ::CUDA_ERROR_DEINITIALIZED,
  8104. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8105. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8106. + * ::CUDA_ERROR_INVALID_VALUE
  8107. + * \notefnerr
  8108. + *
  8109. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8110. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8111. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8112. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8113. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8114. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8115. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8116. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8117. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8118. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8119. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
  8120. + * ::cuMemsetD32, ::cuMemsetD32Async
  8121. + */
  8122. +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
  8123. +
  8124. +/**
  8125. + * \brief Sets device memory
  8126. + *
  8127. + * Sets the memory range of \p N 32-bit values to the specified value
  8128. + * \p ui. The \p dstDevice pointer must be four byte aligned.
  8129. + *
  8130. + * ::cuMemsetD32Async() is asynchronous and can optionally be associated with a
  8131. + * stream by passing a non-zero \p hStream argument.
  8132. + *
  8133. + * \param dstDevice - Destination device pointer
  8134. + * \param ui - Value to set
  8135. + * \param N - Number of elements
  8136. + * \param hStream - Stream identifier
  8137. + *
  8138. + * \return
  8139. + * ::CUDA_SUCCESS,
  8140. + * ::CUDA_ERROR_DEINITIALIZED,
  8141. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8142. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8143. + * ::CUDA_ERROR_INVALID_VALUE
  8144. + * \notefnerr
  8145. + *
  8146. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8147. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8148. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8149. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8150. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8151. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8152. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8153. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8154. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8155. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8156. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32
  8157. + */
  8158. +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
  8159. +
  8160. +/**
  8161. + * \brief Sets device memory
  8162. + *
  8163. + * Sets the 2D memory range of \p Width 8-bit values to the specified value
  8164. + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
  8165. + * specifies the number of bytes between each row. This function performs
  8166. + * fastest when the pitch is one that has been passed back by
  8167. + * ::cuMemAllocPitch().
  8168. + *
  8169. + * ::cuMemsetD2D8Async() is asynchronous and can optionally be associated with a
  8170. + * stream by passing a non-zero \p hStream argument.
  8171. + *
  8172. + * \param dstDevice - Destination device pointer
  8173. + * \param dstPitch - Pitch of destination device pointer
  8174. + * \param uc - Value to set
  8175. + * \param Width - Width of row
  8176. + * \param Height - Number of rows
  8177. + * \param hStream - Stream identifier
  8178. + *
  8179. + * \return
  8180. + * ::CUDA_SUCCESS,
  8181. + * ::CUDA_ERROR_DEINITIALIZED,
  8182. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8183. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8184. + * ::CUDA_ERROR_INVALID_VALUE
  8185. + * \notefnerr
  8186. + *
  8187. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8188. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8189. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8190. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8191. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8192. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8193. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8194. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8195. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
  8196. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8197. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  8198. + * ::cuMemsetD32, ::cuMemsetD32Async
  8199. + */
  8200. +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
  8201. +
  8202. +/**
  8203. + * \brief Sets device memory
  8204. + *
  8205. + * Sets the 2D memory range of \p Width 16-bit values to the specified value
  8206. + * \p us. \p Height specifies the number of rows to set, and \p dstPitch
  8207. + * specifies the number of bytes between each row. The \p dstDevice pointer
  8208. + * and \p dstPitch offset must be two byte aligned. This function performs
  8209. + * fastest when the pitch is one that has been passed back by
  8210. + * ::cuMemAllocPitch().
  8211. + *
  8212. + * ::cuMemsetD2D16Async() is asynchronous and can optionally be associated with a
  8213. + * stream by passing a non-zero \p hStream argument.
  8214. + *
  8215. + * \param dstDevice - Destination device pointer
  8216. + * \param dstPitch - Pitch of destination device pointer
  8217. + * \param us - Value to set
  8218. + * \param Width - Width of row
  8219. + * \param Height - Number of rows
  8220. + * \param hStream - Stream identifier
  8221. + *
  8222. + * \return
  8223. + * ::CUDA_SUCCESS,
  8224. + * ::CUDA_ERROR_DEINITIALIZED,
  8225. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8226. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8227. + * ::CUDA_ERROR_INVALID_VALUE
  8228. + * \notefnerr
  8229. + *
  8230. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8231. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8232. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8233. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8234. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8235. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8236. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8237. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8238. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8239. + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
  8240. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  8241. + * ::cuMemsetD32, ::cuMemsetD32Async
  8242. + */
  8243. +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
  8244. +
  8245. +/**
  8246. + * \brief Sets device memory
  8247. + *
  8248. + * Sets the 2D memory range of \p Width 32-bit values to the specified value
  8249. + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
  8250. + * specifies the number of bytes between each row. The \p dstDevice pointer
  8251. + * and \p dstPitch offset must be four byte aligned. This function performs
  8252. + * fastest when the pitch is one that has been passed back by
  8253. + * ::cuMemAllocPitch().
  8254. + *
  8255. + * ::cuMemsetD2D32Async() is asynchronous and can optionally be associated with a
  8256. + * stream by passing a non-zero \p hStream argument.
  8257. + *
  8258. + * \param dstDevice - Destination device pointer
  8259. + * \param dstPitch - Pitch of destination device pointer
  8260. + * \param ui - Value to set
  8261. + * \param Width - Width of row
  8262. + * \param Height - Number of rows
  8263. + * \param hStream - Stream identifier
  8264. + *
  8265. + * \return
  8266. + * ::CUDA_SUCCESS,
  8267. + * ::CUDA_ERROR_DEINITIALIZED,
  8268. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8269. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8270. + * ::CUDA_ERROR_INVALID_VALUE
  8271. + * \notefnerr
  8272. + *
  8273. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8274. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8275. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8276. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8277. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8278. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8279. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8280. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8281. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
  8282. + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
  8283. + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
  8284. + * ::cuMemsetD32, ::cuMemsetD32Async
  8285. + */
  8286. +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
  8287. +
  8288. +/**
  8289. + * \brief Creates a 1D or 2D CUDA array
  8290. + *
  8291. + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
  8292. + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
  8293. + * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
  8294. + *
  8295. + * \code
  8296. + typedef struct {
  8297. + unsigned int Width;
  8298. + unsigned int Height;
  8299. + CUarray_format Format;
  8300. + unsigned int NumChannels;
  8301. + } CUDA_ARRAY_DESCRIPTOR;
  8302. + * \endcode
  8303. + * where:
  8304. + *
  8305. + * - \p Width, and \p Height are the width, and height of the CUDA array (in
  8306. + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
  8307. + * otherwise;
  8308. + * - \p Format specifies the format of the elements; ::CUarray_format is
  8309. + * defined as:
  8310. + * \code
  8311. + typedef enum CUarray_format_enum {
  8312. + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
  8313. + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
  8314. + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
  8315. + CU_AD_FORMAT_SIGNED_INT8 = 0x08,
  8316. + CU_AD_FORMAT_SIGNED_INT16 = 0x09,
  8317. + CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
  8318. + CU_AD_FORMAT_HALF = 0x10,
  8319. + CU_AD_FORMAT_FLOAT = 0x20
  8320. + } CUarray_format;
  8321. + * \endcode
  8322. + * - \p NumChannels specifies the number of packed components per CUDA array
  8323. + * element; it may be 1, 2, or 4;
  8324. + *
  8325. + * Here are examples of CUDA array descriptions:
  8326. + *
  8327. + * Description for a CUDA array of 2048 floats:
  8328. + * \code
  8329. + CUDA_ARRAY_DESCRIPTOR desc;
  8330. + desc.Format = CU_AD_FORMAT_FLOAT;
  8331. + desc.NumChannels = 1;
  8332. + desc.Width = 2048;
  8333. + desc.Height = 1;
  8334. + * \endcode
  8335. + *
  8336. + * Description for a 64 x 64 CUDA array of floats:
  8337. + * \code
  8338. + CUDA_ARRAY_DESCRIPTOR desc;
  8339. + desc.Format = CU_AD_FORMAT_FLOAT;
  8340. + desc.NumChannels = 1;
  8341. + desc.Width = 64;
  8342. + desc.Height = 64;
  8343. + * \endcode
  8344. + *
  8345. + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
  8346. + * float16's:
  8347. + * \code
  8348. + CUDA_ARRAY_DESCRIPTOR desc;
  8349. + desc.Format = CU_AD_FORMAT_HALF;
  8350. + desc.NumChannels = 4;
  8351. + desc.Width = width;
  8352. + desc.Height = height;
  8353. + * \endcode
  8354. + *
  8355. + * Description for a \p width x \p height CUDA array of 16-bit elements, each
  8356. + * of which is two 8-bit unsigned chars:
  8357. + * \code
  8358. + CUDA_ARRAY_DESCRIPTOR desc;
  8359. + desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
  8360. + desc.NumChannels = 2;
  8361. + desc.Width = width;
  8362. + desc.Height = height;
  8363. + * \endcode
  8364. + *
  8365. + * \param pHandle - Returned array
  8366. + * \param pAllocateArray - Array descriptor
  8367. + *
  8368. + * \return
  8369. + * ::CUDA_SUCCESS,
  8370. + * ::CUDA_ERROR_DEINITIALIZED,
  8371. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8372. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8373. + * ::CUDA_ERROR_INVALID_VALUE,
  8374. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  8375. + * ::CUDA_ERROR_UNKNOWN
  8376. + * \notefnerr
  8377. + *
  8378. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
  8379. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8380. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8381. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8382. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8383. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8384. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8385. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8386. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  8387. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  8388. + */
  8389. +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
  8390. +
  8391. +/**
  8392. + * \brief Get a 1D or 2D CUDA array descriptor
  8393. + *
  8394. + * Returns in \p *pArrayDescriptor a descriptor containing information on the
  8395. + * format and dimensions of the CUDA array \p hArray. It is useful for
  8396. + * subroutines that have been passed a CUDA array, but need to know the CUDA
  8397. + * array parameters for validation or other purposes.
  8398. + *
  8399. + * \param pArrayDescriptor - Returned array descriptor
  8400. + * \param hArray - Array to get descriptor of
  8401. + *
  8402. + * \return
  8403. + * ::CUDA_SUCCESS,
  8404. + * ::CUDA_ERROR_DEINITIALIZED,
  8405. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8406. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8407. + * ::CUDA_ERROR_INVALID_VALUE,
  8408. + * ::CUDA_ERROR_INVALID_HANDLE
  8409. + * \notefnerr
  8410. + *
  8411. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8412. + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
  8413. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8414. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8415. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8416. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8417. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8418. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8419. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  8420. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  8421. + */
  8422. +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
  8423. +#endif /* __CUDA_API_VERSION >= 3020 */
  8424. +
  8425. +
  8426. +/**
  8427. + * \brief Destroys a CUDA array
  8428. + *
  8429. + * Destroys the CUDA array \p hArray.
  8430. + *
  8431. + * \param hArray - Array to destroy
  8432. + *
  8433. + * \return
  8434. + * ::CUDA_SUCCESS,
  8435. + * ::CUDA_ERROR_DEINITIALIZED,
  8436. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8437. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8438. + * ::CUDA_ERROR_INVALID_HANDLE,
  8439. + * ::CUDA_ERROR_ARRAY_IS_MAPPED
  8440. + * \notefnerr
  8441. + *
  8442. + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8443. + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8444. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8445. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8446. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8447. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8448. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8449. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8450. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  8451. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  8452. + */
  8453. +CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
  8454. +
  8455. +#if __CUDA_API_VERSION >= 3020
  8456. +/**
  8457. + * \brief Creates a 3D CUDA array
  8458. + *
  8459. + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
  8460. + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
  8461. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
  8462. + *
  8463. + * \code
  8464. + typedef struct {
  8465. + unsigned int Width;
  8466. + unsigned int Height;
  8467. + unsigned int Depth;
  8468. + CUarray_format Format;
  8469. + unsigned int NumChannels;
  8470. + unsigned int Flags;
  8471. + } CUDA_ARRAY3D_DESCRIPTOR;
  8472. + * \endcode
  8473. + * where:
  8474. + *
  8475. + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
  8476. + * CUDA array (in elements); the following types of CUDA arrays can be allocated:
  8477. + * - A 1D array is allocated if \p Height and \p Depth extents are both zero.
  8478. + * - A 2D array is allocated if only \p Depth extent is zero.
  8479. + * - A 3D array is allocated if all three extents are non-zero.
  8480. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the
  8481. + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
  8482. + * of layers is determined by the depth extent.
  8483. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and
  8484. + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
  8485. + * of layers is determined by the depth extent.
  8486. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the
  8487. + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
  8488. + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
  8489. + * where the six layers represent the six faces of a cube. The order of the six
  8490. + * layers in memory is the same as that listed in ::CUarray_cubemap_face.
  8491. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero,
  8492. + * and both the ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
  8493. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
  8494. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that
  8495. + * consists of a collection of cubemaps. The first six layers represent the first
  8496. + * cubemap, the next six layers form the second cubemap, and so on.
  8497. + *
  8498. + * - \p Format specifies the format of the elements; ::CUarray_format is
  8499. + * defined as:
  8500. + * \code
  8501. + typedef enum CUarray_format_enum {
  8502. + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
  8503. + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
  8504. + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
  8505. + CU_AD_FORMAT_SIGNED_INT8 = 0x08,
  8506. + CU_AD_FORMAT_SIGNED_INT16 = 0x09,
  8507. + CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
  8508. + CU_AD_FORMAT_HALF = 0x10,
  8509. + CU_AD_FORMAT_FLOAT = 0x20
  8510. + } CUarray_format;
  8511. + * \endcode
  8512. + *
  8513. + * - \p NumChannels specifies the number of packed components per CUDA array
  8514. + * element; it may be 1, 2, or 4;
  8515. + *
  8516. + * - \p Flags may be set to
  8517. + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
  8518. + * \p Depth specifies the number of layers, not the depth of a 3D array.
  8519. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
  8520. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
  8521. + * to a surface reference.
  8522. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
  8523. + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
  8524. + * then \p Depth must be a multiple of six.
  8525. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
  8526. + * Texture gather can only be performed on 2D CUDA arrays.
  8527. + *
  8528. + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
  8529. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
  8530. + * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
  8531. + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
  8532. + *
  8533. + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
  8534. + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
  8535. + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
  8536. + *
  8537. + * <table>
  8538. + * <tr><td><b>CUDA array type</b></td>
  8539. + * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
  8540. + * (depth range)}</b></td>
  8541. + * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
  8542. + * {(width range in elements), (height range), (depth range)}</b></td></tr>
  8543. + * <tr><td>1D</td>
  8544. + * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
  8545. + * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
  8546. + * <tr><td>2D</td>
  8547. + * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
  8548. + * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
  8549. + * <tr><td>3D</td>
  8550. + * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
  8551. + * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
  8552. + * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
  8553. + * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
  8554. + * (1,SURFACE3D_DEPTH) }</small></td></tr>
  8555. + * <tr><td>1D Layered</td>
  8556. + * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
  8557. + * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
  8558. + * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
  8559. + * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
  8560. + * <tr><td>2D Layered</td>
  8561. + * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
  8562. + * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
  8563. + * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
  8564. + * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
  8565. + * <tr><td>Cubemap</td>
  8566. + * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
  8567. + * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
  8568. + * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
  8569. + * <tr><td>Cubemap Layered</td>
  8570. + * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
  8571. + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
  8572. + * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
  8573. + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
  8574. + * </table>
  8575. + *
  8576. + * Here are examples of CUDA array descriptions:
  8577. + *
  8578. + * Description for a CUDA array of 2048 floats:
  8579. + * \code
  8580. + CUDA_ARRAY3D_DESCRIPTOR desc;
  8581. + desc.Format = CU_AD_FORMAT_FLOAT;
  8582. + desc.NumChannels = 1;
  8583. + desc.Width = 2048;
  8584. + desc.Height = 0;
  8585. + desc.Depth = 0;
  8586. + * \endcode
  8587. + *
  8588. + * Description for a 64 x 64 CUDA array of floats:
  8589. + * \code
  8590. + CUDA_ARRAY3D_DESCRIPTOR desc;
  8591. + desc.Format = CU_AD_FORMAT_FLOAT;
  8592. + desc.NumChannels = 1;
  8593. + desc.Width = 64;
  8594. + desc.Height = 64;
  8595. + desc.Depth = 0;
  8596. + * \endcode
  8597. + *
  8598. + * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
  8599. + * 4x16-bit float16's:
  8600. + * \code
  8601. + CUDA_ARRAY3D_DESCRIPTOR desc;
  8602. + desc.Format = CU_AD_FORMAT_HALF;
  8603. + desc.NumChannels = 4;
  8604. + desc.Width = width;
  8605. + desc.Height = height;
  8606. + desc.Depth = depth;
  8607. + * \endcode
  8608. + *
  8609. + * \param pHandle - Returned array
  8610. + * \param pAllocateArray - 3D array descriptor
  8611. + *
  8612. + * \return
  8613. + * ::CUDA_SUCCESS,
  8614. + * ::CUDA_ERROR_DEINITIALIZED,
  8615. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8616. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8617. + * ::CUDA_ERROR_INVALID_VALUE,
  8618. + * ::CUDA_ERROR_OUT_OF_MEMORY,
  8619. + * ::CUDA_ERROR_UNKNOWN
  8620. + * \notefnerr
  8621. + *
  8622. + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
  8623. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8624. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8625. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8626. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8627. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8628. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8629. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8630. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  8631. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  8632. + */
  8633. +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
  8634. +
  8635. +/**
  8636. + * \brief Get a 3D CUDA array descriptor
  8637. + *
  8638. + * Returns in \p *pArrayDescriptor a descriptor containing information on the
  8639. + * format and dimensions of the CUDA array \p hArray. It is useful for
  8640. + * subroutines that have been passed a CUDA array, but need to know the CUDA
  8641. + * array parameters for validation or other purposes.
  8642. + *
  8643. + * This function may be called on 1D and 2D arrays, in which case the \p Height
  8644. + * and/or \p Depth members of the descriptor struct will be set to 0.
  8645. + *
  8646. + * \param pArrayDescriptor - Returned 3D array descriptor
  8647. + * \param hArray - 3D array to get descriptor of
  8648. + *
  8649. + * \return
  8650. + * ::CUDA_SUCCESS,
  8651. + * ::CUDA_ERROR_DEINITIALIZED,
  8652. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8653. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8654. + * ::CUDA_ERROR_INVALID_VALUE,
  8655. + * ::CUDA_ERROR_INVALID_HANDLE
  8656. + * \notefnerr
  8657. + *
  8658. + * \sa ::cuArray3DCreate, ::cuArrayCreate,
  8659. + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
  8660. + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
  8661. + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
  8662. + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
  8663. + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
  8664. + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
  8665. + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
  8666. + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
  8667. + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
  8668. + */
  8669. +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
  8670. +#endif /* __CUDA_API_VERSION >= 3020 */
  8671. +
  8672. +/** @} */ /* END CUDA_MEM */
  8673. +
  8674. +/**
  8675. + * \defgroup CUDA_UNIFIED Unified Addressing
  8676. + *
  8677. + * This section describes the unified addressing functions of the
  8678. + * low-level CUDA driver application programming interface.
  8679. + *
  8680. + * @{
  8681. + *
  8682. + * \section CUDA_UNIFIED_overview Overview
  8683. + *
  8684. + * CUDA devices can share a unified address space with the host.
  8685. + * For these devices there is no distinction between a device
  8686. + * pointer and a host pointer -- the same pointer value may be
  8687. + * used to access memory from the host program and from a kernel
  8688. + * running on the device (with exceptions enumerated below).
  8689. + *
  8690. + * \section CUDA_UNIFIED_support Supported Platforms
  8691. + *
  8692. + * Whether or not a device supports unified addressing may be
  8693. + * queried by calling ::cuDeviceGetAttribute() with the device
  8694. + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
  8695. + *
  8696. + * Unified addressing is automatically enabled in 64-bit processes
  8697. + * on devices with compute capability greater than or equal to 2.0.
  8698. + *
  8699. + * Unified addressing is not yet supported on Windows Vista or
  8700. + * Windows 7 for devices that do not use the TCC driver model.
  8701. + *
  8702. + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
  8703. + *
  8704. + * It is possible to look up information about the memory which backs a
  8705. + * pointer value. For instance, one may want to know if a pointer points
  8706. + * to host or device memory. As another example, in the case of device
  8707. + * memory, one may want to know on which CUDA device the memory
  8708. + * resides. These properties may be queried using the function
  8709. + * ::cuPointerGetAttribute().
  8710. + *
  8711. + * Since pointers are unique, it is not necessary to specify information
  8712. + * about the pointers specified to the various copy functions in the
  8713. + * CUDA API. The function ::cuMemcpy() may be used to perform a copy
  8714. + * between two pointers, ignoring whether they point to host or device
  8715. + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
  8716. + * unnecessary for devices supporting unified addressing). For
  8717. + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
  8718. + * used to specify that the CUDA driver should infer the location of the
  8719. + * pointer from its value.
  8720. + *
  8721. + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
  8722. + *
  8723. + * All host memory allocated in all contexts using ::cuMemAllocHost() and
  8724. + * ::cuMemHostAlloc() is always directly accessible from all contexts on
  8725. + * all devices that support unified addressing. This is the case regardless
  8726. + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
  8727. + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
  8728. + *
  8729. + * The pointer value through which allocated host memory may be accessed
  8730. + * in kernels on all devices that support unified addressing is the same
  8731. + * as the pointer value through which that memory is accessed on the host,
  8732. + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
  8733. + * pointer for these allocations.
  8734. + *
  8735. + * Note that this is not the case for memory allocated using the flag
  8736. + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
  8737. + *
  8738. + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
  8739. + *
  8740. + * Upon enabling direct access from a context that supports unified addressing
  8741. + * to another peer context that supports unified addressing using
  8742. + * ::cuCtxEnablePeerAccess(), all memory allocated in the peer context using
  8743. + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
  8744. + * by the current context. The device pointer value through
  8745. + * which any peer memory may be accessed in the current context
  8746. + * is the same pointer value through which that memory may be
  8747. + * accessed in the peer context.
  8748. + *
  8749. + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
  8750. + *
  8751. + * Not all memory may be accessed on devices through the same pointer
  8752. + * value through which it is accessed on the host. These exceptions
  8753. + * are host memory registered using ::cuMemHostRegister() and host memory
  8754. + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these
  8755. + * exceptions, there exists a distinct host and device address for the
  8756. + * memory. The device address is guaranteed to not overlap any valid host
  8757. + * pointer range and is guaranteed to have the same value across all
  8758. + * contexts that support unified addressing.
  8759. + *
  8760. + * This device address may be queried using ::cuMemHostGetDevicePointer()
  8761. + * when a context using unified addressing is current. Either the host
  8762. + * or the unified device pointer value may be used to refer to this memory
  8763. + * through ::cuMemcpy() and similar functions using the
  8764. + * ::CU_MEMORYTYPE_UNIFIED memory type.
  8765. + *
  8766. + */
  8767. +
  8768. +#if __CUDA_API_VERSION >= 4000
  8769. +/**
  8770. + * \brief Returns information about a pointer
  8771. + *
  8772. + * The supported attributes are:
  8773. + *
  8774. + * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
  8775. + *
  8776. + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or
  8777. + * registered.
  8778. + * The type of \p data must be ::CUcontext *.
  8779. + *
  8780. + * If \p ptr was not allocated by, mapped by, or registered with
  8781. + * a ::CUcontext which uses unified virtual addressing then
  8782. + * ::CUDA_ERROR_INVALID_VALUE is returned.
  8783. + *
  8784. + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
  8785. + *
  8786. + * Returns in \p *data the physical memory type of the memory that
  8787. + * \p ptr addresses as a ::CUmemorytype enumerated value.
  8788. + * The type of \p data must be unsigned int *.
  8789. + *
  8790. + * If \p ptr addresses device memory then \p *data is set to
  8791. + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the
  8792. + * memory resides is the ::CUdevice of the ::CUcontext returned by the
  8793. + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
  8794. + *
  8795. + * If \p ptr addresses host memory then \p *data is set to
  8796. + * ::CU_MEMORYTYPE_HOST.
  8797. + *
  8798. + * If \p ptr was not allocated by, mapped by, or registered with
  8799. + * a ::CUcontext which uses unified virtual addressing then
  8800. + * ::CUDA_ERROR_INVALID_VALUE is returned.
  8801. + *
  8802. + * If the current ::CUcontext does not support unified virtual
  8803. + * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
  8804. + *
  8805. + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
  8806. + *
  8807. + * Returns in \p *data the device pointer value through which
  8808. + * \p ptr may be accessed by kernels running in the current
  8809. + * ::CUcontext.
  8810. + * The type of \p data must be CUdeviceptr *.
  8811. + *
  8812. + * If there exists no device pointer value through which
  8813. + * kernels running in the current ::CUcontext may access
  8814. + * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
  8815. + *
  8816. + * If there is no current ::CUcontext then
  8817. + * ::CUDA_ERROR_INVALID_CONTEXT is returned.
  8818. + *
  8819. + * Except in the exceptional disjoint addressing cases discussed
  8820. + * below, the value returned in \p *data will equal the input
  8821. + * value \p ptr.
  8822. + *
  8823. + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
  8824. + *
  8825. + * Returns in \p *data the host pointer value through which
  8826. + * \p ptr may be accessed by the host program.
  8827. + * The type of \p data must be void **.
  8828. + * If there exists no host pointer value through which
  8829. + * the host program may directly access \p ptr then
  8830. + * ::CUDA_ERROR_INVALID_VALUE is returned.
  8831. + *
  8832. + * Except in the exceptional disjoint addressing cases discussed
  8833. + * below, the value returned in \p *data will equal the input
  8834. + * value \p ptr.
  8835. + *
  8836. + *
  8837. + * \par
  8838. + *
  8839. + * Note that for most allocations in the unified virtual address space
  8840. + * the host and device pointer for accessing the allocation will be the
  8841. + * same. The exceptions to this are
  8842. + * - user memory registered using ::cuMemHostRegister
  8843. + * - host memory allocated using ::cuMemHostAlloc with the
  8844. + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag
  8845. + * For these types of allocation there will exist separate, disjoint host
  8846. + * and device addresses for accessing the allocation. In particular
  8847. + * - The host address will correspond to an invalid unmapped device address
  8848. + * (which will result in an exception if accessed from the device)
  8849. + * - The device address will correspond to an invalid unmapped host address
  8850. + * (which will result in an exception if accessed from the host).
  8851. + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
  8852. + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
  8853. + * and device addresses from either address.
  8854. + *
  8855. + * \param data - Returned pointer attribute value
  8856. + * \param attribute - Pointer attribute to query
  8857. + * \param ptr - Pointer
  8858. + *
  8859. + * \return
  8860. + * ::CUDA_SUCCESS,
  8861. + * ::CUDA_ERROR_DEINITIALIZED,
  8862. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8863. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8864. + * ::CUDA_ERROR_INVALID_VALUE,
  8865. + * ::CUDA_ERROR_INVALID_DEVICE
  8866. + * \notefnerr
  8867. + *
  8868. + * \sa ::cuMemAlloc,
  8869. + * ::cuMemFree,
  8870. + * ::cuMemAllocHost,
  8871. + * ::cuMemFreeHost,
  8872. + * ::cuMemHostAlloc,
  8873. + * ::cuMemHostRegister,
  8874. + * ::cuMemHostUnregister
  8875. + */
  8876. +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
  8877. +#endif /* __CUDA_API_VERSION >= 4000 */
  8878. +
  8879. +/** @} */ /* END CUDA_UNIFIED */
  8880. +
  8881. +/**
  8882. + * \defgroup CUDA_STREAM Stream Management
  8883. + *
  8884. + * This section describes the stream management functions of the low-level CUDA
  8885. + * driver application programming interface.
  8886. + *
  8887. + * @{
  8888. + */
  8889. +
  8890. +/**
  8891. + * \brief Create a stream
  8892. + *
  8893. + * Creates a stream and returns a handle in \p phStream. \p Flags is required
  8894. + * to be 0.
  8895. + *
  8896. + * \param phStream - Returned newly created stream
  8897. + * \param Flags - Parameters for stream creation (must be 0)
  8898. + *
  8899. + * \return
  8900. + * ::CUDA_SUCCESS,
  8901. + * ::CUDA_ERROR_DEINITIALIZED,
  8902. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8903. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8904. + * ::CUDA_ERROR_INVALID_VALUE,
  8905. + * ::CUDA_ERROR_OUT_OF_MEMORY
  8906. + * \notefnerr
  8907. + *
  8908. + * \sa ::cuStreamDestroy,
  8909. + * ::cuStreamWaitEvent,
  8910. + * ::cuStreamQuery,
  8911. + * ::cuStreamSynchronize
  8912. + */
  8913. +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
  8914. +
  8915. +/**
  8916. + * \brief Make a compute stream wait on an event
  8917. + *
  8918. + * Makes all future work submitted to \p hStream wait until \p hEvent
  8919. + * reports completion before beginning execution. This synchronization
  8920. + * will be performed efficiently on the device. The event \p hEvent may
  8921. + * be from a different context than \p hStream, in which case this function
  8922. + * will perform cross-device synchronization.
  8923. + *
  8924. + * The stream \p hStream will wait only for the completion of the most recent
  8925. + * host call to ::cuEventRecord() on \p hEvent. Once this call has returned,
  8926. + * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be
  8927. + * called on \p hEvent again, and subsequent calls will not have any
  8928. + * effect on \p hStream.
  8929. + *
  8930. + * If \p hStream is 0 (the NULL stream) any future work submitted in any stream
  8931. + * will wait for \p hEvent to complete before beginning execution. This
  8932. + * effectively creates a barrier for all future work submitted to the context.
  8933. + *
  8934. + * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if
  8935. + * the record has already completed, and so is a functional no-op.
  8936. + *
  8937. + * \param hStream - Stream to wait
  8938. + * \param hEvent - Event to wait on (may not be NULL)
  8939. + * \param Flags - Parameters for the operation (must be 0)
  8940. + *
  8941. + * \return
  8942. + * ::CUDA_SUCCESS,
  8943. + * ::CUDA_ERROR_DEINITIALIZED,
  8944. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8945. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8946. + * ::CUDA_ERROR_INVALID_HANDLE
  8947. + * \notefnerr
  8948. + *
  8949. + * \sa ::cuStreamCreate,
  8950. + * ::cuEventRecord,
  8951. + * ::cuStreamQuery,
  8952. + * ::cuStreamSynchronize,
  8953. + * ::cuStreamDestroy
  8954. + */
  8955. +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
  8956. +
  8957. +/**
  8958. + * \brief Determine status of a compute stream
  8959. + *
  8960. + * Returns ::CUDA_SUCCESS if all operations in the stream specified by
  8961. + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
  8962. + *
  8963. + * \param hStream - Stream to query status of
  8964. + *
  8965. + * \return
  8966. + * ::CUDA_SUCCESS,
  8967. + * ::CUDA_ERROR_DEINITIALIZED,
  8968. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8969. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8970. + * ::CUDA_ERROR_INVALID_HANDLE,
  8971. + * ::CUDA_ERROR_NOT_READY
  8972. + * \notefnerr
  8973. + *
  8974. + * \sa ::cuStreamCreate,
  8975. + * ::cuStreamWaitEvent,
  8976. + * ::cuStreamDestroy,
  8977. + * ::cuStreamSynchronize
  8978. + */
  8979. +CUresult CUDAAPI cuStreamQuery(CUstream hStream);
  8980. +
  8981. +/**
  8982. + * \brief Wait until a stream's tasks are completed
  8983. + *
  8984. + * Waits until the device has completed all operations in the stream specified
  8985. + * by \p hStream. If the context was created with the
  8986. + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
  8987. + * stream is finished with all of its tasks.
  8988. + *
  8989. + * \param hStream - Stream to wait for
  8990. + *
  8991. + * \return
  8992. + * ::CUDA_SUCCESS,
  8993. + * ::CUDA_ERROR_DEINITIALIZED,
  8994. + * ::CUDA_ERROR_NOT_INITIALIZED,
  8995. + * ::CUDA_ERROR_INVALID_CONTEXT,
  8996. + * ::CUDA_ERROR_INVALID_HANDLE
  8997. + * \notefnerr
  8998. + *
  8999. + * \sa ::cuStreamCreate,
  9000. + * ::cuStreamDestroy,
  9001. + * ::cuStreamWaitEvent,
  9002. + * ::cuStreamQuery
  9003. + */
  9004. +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
  9005. +
  9006. +#if __CUDA_API_VERSION >= 4000
  9007. +/**
  9008. + * \brief Destroys a stream
  9009. + *
  9010. + * Destroys the stream specified by \p hStream.
  9011. + *
  9012. + * In case the device is still doing work in the stream \p hStream
  9013. + * when ::cuStreamDestroy() is called, the function will return immediately
  9014. + * and the resources associated with \p hStream will be released automatically
  9015. + * once the device has completed all work in \p hStream.
  9016. + *
  9017. + * \param hStream - Stream to destroy
  9018. + *
  9019. + * \return
  9020. + * ::CUDA_SUCCESS,
  9021. + * ::CUDA_ERROR_DEINITIALIZED,
  9022. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9023. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9024. + * ::CUDA_ERROR_INVALID_VALUE
  9025. + * \notefnerr
  9026. + *
  9027. + * \sa ::cuStreamCreate,
  9028. + * ::cuStreamWaitEvent,
  9029. + * ::cuStreamQuery,
  9030. + * ::cuStreamSynchronize
  9031. + */
  9032. +CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
  9033. +#endif /* __CUDA_API_VERSION >= 4000 */
  9034. +
  9035. +/** @} */ /* END CUDA_STREAM */
  9036. +
  9037. +
  9038. +/**
  9039. + * \defgroup CUDA_EVENT Event Management
  9040. + *
  9041. + * This section describes the event management functions of the low-level CUDA
  9042. + * driver application programming interface.
  9043. + *
  9044. + * @{
  9045. + */
  9046. +
  9047. +/**
  9048. + * \brief Creates an event
  9049. + *
  9050. + * Creates an event *phEvent with the flags specified via \p Flags. Valid flags
  9051. + * include:
  9052. + * - ::CU_EVENT_DEFAULT: Default event creation flag.
  9053. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
  9054. + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on
  9055. + * an event created with this flag will block until the event has actually
  9056. + * been recorded.
  9057. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
  9058. + * to record timing data. Events created with this flag specified and
  9059. + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
  9060. + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
  9061. + *
  9062. + * \param phEvent - Returns newly created event
  9063. + * \param Flags - Event creation flags
  9064. + *
  9065. + * \return
  9066. + * ::CUDA_SUCCESS,
  9067. + * ::CUDA_ERROR_DEINITIALIZED,
  9068. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9069. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9070. + * ::CUDA_ERROR_INVALID_VALUE,
  9071. + * ::CUDA_ERROR_OUT_OF_MEMORY
  9072. + * \notefnerr
  9073. + *
  9074. + * \sa
  9075. + * ::cuEventRecord,
  9076. + * ::cuEventQuery,
  9077. + * ::cuEventSynchronize,
  9078. + * ::cuEventDestroy,
  9079. + * ::cuEventElapsedTime
  9080. + */
  9081. +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
  9082. +
  9083. +/**
  9084. + * \brief Records an event
  9085. + *
  9086. + * Records an event. If \p hStream is non-zero, the event is recorded after all
  9087. + * preceding operations in \p hStream have been completed; otherwise, it is
  9088. + * recorded after all preceding operations in the CUDA context have been
  9089. + * completed. Since the operation is asynchronous, ::cuEventQuery() and/or
  9090. + * ::cuEventSynchronize() must be used to determine when the event has actually
  9091. + * been recorded.
  9092. + *
  9093. + * If ::cuEventRecord() has previously been called on \p hEvent, then this
  9094. + * call will overwrite any existing state in \p hEvent. Any subsequent calls
  9095. + * which examine the status of \p hEvent will only examine the completion of
  9096. + * this most recent call to ::cuEventRecord().
  9097. + *
  9098. + * It is necessary that \p hEvent and \p hStream be created on the same context.
  9099. + *
  9100. + * \param hEvent - Event to record
  9101. + * \param hStream - Stream to record event for
  9102. + *
  9103. + * \return
  9104. + * ::CUDA_SUCCESS,
  9105. + * ::CUDA_ERROR_DEINITIALIZED,
  9106. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9107. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9108. + * ::CUDA_ERROR_INVALID_HANDLE,
  9109. + * ::CUDA_ERROR_INVALID_VALUE
  9110. + * \notefnerr
  9111. + *
  9112. + * \sa ::cuEventCreate,
  9113. + * ::cuEventQuery,
  9114. + * ::cuEventSynchronize,
  9115. + * ::cuStreamWaitEvent,
  9116. + * ::cuEventDestroy,
  9117. + * ::cuEventElapsedTime
  9118. + */
  9119. +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
  9120. +
  9121. +/**
  9122. + * \brief Queries an event's status
  9123. + *
  9124. + * Query the status of all device work preceding the most recent
  9125. + * call to ::cuEventRecord() (in the appropriate compute streams,
  9126. + * as specified by the arguments to ::cuEventRecord()).
  9127. + *
  9128. + * If this work has successfully been completed by the device, or if
  9129. + * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is
  9130. + * returned. If this work has not yet been completed by the device then
  9131. + * ::CUDA_ERROR_NOT_READY is returned.
  9132. + *
  9133. + * \param hEvent - Event to query
  9134. + *
  9135. + * \return
  9136. + * ::CUDA_SUCCESS,
  9137. + * ::CUDA_ERROR_DEINITIALIZED,
  9138. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9139. + * ::CUDA_ERROR_INVALID_HANDLE,
  9140. + * ::CUDA_ERROR_INVALID_VALUE,
  9141. + * ::CUDA_ERROR_NOT_READY
  9142. + * \notefnerr
  9143. + *
  9144. + * \sa ::cuEventCreate,
  9145. + * ::cuEventRecord,
  9146. + * ::cuEventSynchronize,
  9147. + * ::cuEventDestroy,
  9148. + * ::cuEventElapsedTime
  9149. + */
  9150. +CUresult CUDAAPI cuEventQuery(CUevent hEvent);
  9151. +
  9152. +/**
  9153. + * \brief Waits for an event to complete
  9154. + *
  9155. + * Wait until the completion of all device work preceding the most recent
  9156. + * call to ::cuEventRecord() (in the appropriate compute streams, as specified
  9157. + * by the arguments to ::cuEventRecord()).
  9158. + *
  9159. + * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is
  9160. + * returned immediately.
  9161. + *
  9162. + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
  9163. + * flag will cause the calling CPU thread to block until the event has
  9164. + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has
  9165. + * not been set, then the CPU thread will busy-wait until the event has
  9166. + * been completed by the device.
  9167. + *
  9168. + * \param hEvent - Event to wait for
  9169. + *
  9170. + * \return
  9171. + * ::CUDA_SUCCESS,
  9172. + * ::CUDA_ERROR_DEINITIALIZED,
  9173. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9174. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9175. + * ::CUDA_ERROR_INVALID_HANDLE
  9176. + * \notefnerr
  9177. + *
  9178. + * \sa ::cuEventCreate,
  9179. + * ::cuEventRecord,
  9180. + * ::cuEventQuery,
  9181. + * ::cuEventDestroy,
  9182. + * ::cuEventElapsedTime
  9183. + */
  9184. +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
  9185. +
  9186. +#if __CUDA_API_VERSION >= 4000
  9187. +/**
  9188. + * \brief Destroys an event
  9189. + *
  9190. + * Destroys the event specified by \p hEvent.
  9191. + *
  9192. + * In case \p hEvent has been recorded but has not yet been completed
  9193. + * when ::cuEventDestroy() is called, the function will return immediately and
  9194. + * the resources associated with \p hEvent will be released automatically once
  9195. + * the device has completed \p hEvent.
  9196. + *
  9197. + * \param hEvent - Event to destroy
  9198. + *
  9199. + * \return
  9200. + * ::CUDA_SUCCESS,
  9201. + * ::CUDA_ERROR_DEINITIALIZED,
  9202. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9203. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9204. + * ::CUDA_ERROR_INVALID_HANDLE
  9205. + * \notefnerr
  9206. + *
  9207. + * \sa ::cuEventCreate,
  9208. + * ::cuEventRecord,
  9209. + * ::cuEventQuery,
  9210. + * ::cuEventSynchronize,
  9211. + * ::cuEventElapsedTime
  9212. + */
  9213. +CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
  9214. +#endif /* __CUDA_API_VERSION >= 4000 */
  9215. +
  9216. +/**
  9217. + * \brief Computes the elapsed time between two events
  9218. + *
  9219. + * Computes the elapsed time between two events (in milliseconds with a
  9220. + * resolution of around 0.5 microseconds).
  9221. + *
  9222. + * If either event was last recorded in a non-NULL stream, the resulting time
  9223. + * may be greater than expected (even if both used the same stream handle). This
  9224. + * happens because the ::cuEventRecord() operation takes place asynchronously
  9225. + * and there is no guarantee that the measured latency is actually just between
  9226. + * the two events. Any number of other different stream operations could execute
  9227. + * in between the two measured events, thus altering the timing in a significant
  9228. + * way.
  9229. + *
  9230. + * If ::cuEventRecord() has not been called on either event then
  9231. + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
  9232. + * on both events but one or both of them has not yet been completed (that is,
  9233. + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
  9234. + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
  9235. + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
  9236. + * ::CUDA_ERROR_INVALID_HANDLE.
  9237. + *
  9238. + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
  9239. + * \param hStart - Starting event
  9240. + * \param hEnd - Ending event
  9241. + *
  9242. + * \return
  9243. + * ::CUDA_SUCCESS,
  9244. + * ::CUDA_ERROR_DEINITIALIZED,
  9245. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9246. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9247. + * ::CUDA_ERROR_INVALID_HANDLE,
  9248. + * ::CUDA_ERROR_NOT_READY
  9249. + * \notefnerr
  9250. + *
  9251. + * \sa ::cuEventCreate,
  9252. + * ::cuEventRecord,
  9253. + * ::cuEventQuery,
  9254. + * ::cuEventSynchronize,
  9255. + * ::cuEventDestroy
  9256. + */
  9257. +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
  9258. +
  9259. +/** @} */ /* END CUDA_EVENT */
  9260. +
  9261. +
  9262. +/**
  9263. + * \defgroup CUDA_EXEC Execution Control
  9264. + *
  9265. + * This section describes the execution control functions of the low-level CUDA
  9266. + * driver application programming interface.
  9267. + *
  9268. + * @{
  9269. + */
  9270. +
  9271. +/**
  9272. + * \brief Returns information about a function
  9273. + *
  9274. + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
  9275. + * given by \p hfunc. The supported attributes are:
  9276. + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
  9277. + * per block, beyond which a launch of the function would fail. This number
  9278. + * depends on both the function and the device on which the function is
  9279. + * currently loaded.
  9280. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
  9281. + * statically-allocated shared memory per block required by this function.
  9282. + * This does not include dynamically-allocated shared memory requested by
  9283. + * the user at runtime.
  9284. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
  9285. + * constant memory required by this function.
  9286. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
  9287. + * used by each thread of this function.
  9288. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
  9289. + * of this function.
  9290. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
  9291. + * which the function was compiled. This value is the major PTX version * 10
  9292. + * + the minor PTX version, so a PTX version 1.3 function would return the
  9293. + * value 13. Note that this may return the undefined value of 0 for cubins
  9294. + * compiled prior to CUDA 3.0.
  9295. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
  9296. + * which the function was compiled. This value is the major binary
  9297. + * version * 10 + the minor binary version, so a binary version 1.3 function
  9298. + * would return the value 13. Note that this will return a value of 10 for
  9299. + * legacy cubins that do not have a properly-encoded binary architecture
  9300. + * version.
  9301. + *
  9302. + * \param pi - Returned attribute value
  9303. + * \param attrib - Attribute requested
  9304. + * \param hfunc - Function to query attribute of
  9305. + *
  9306. + * \return
  9307. + * ::CUDA_SUCCESS,
  9308. + * ::CUDA_ERROR_DEINITIALIZED,
  9309. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9310. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9311. + * ::CUDA_ERROR_INVALID_HANDLE,
  9312. + * ::CUDA_ERROR_INVALID_VALUE
  9313. + * \notefnerr
  9314. + *
  9315. + * \sa ::cuCtxGetCacheConfig,
  9316. + * ::cuCtxSetCacheConfig,
  9317. + * ::cuFuncSetCacheConfig,
  9318. + * ::cuLaunchKernel
  9319. + */
  9320. +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
  9321. +
  9322. +/**
  9323. + * \brief Sets the preferred cache configuration for a device function
  9324. + *
  9325. + * On devices where the L1 cache and shared memory use the same hardware
  9326. + * resources, this sets through \p config the preferred cache configuration for
  9327. + * the device function \p hfunc. This is only a preference. The driver will use
  9328. + * the requested configuration if possible, but it is free to choose a different
  9329. + * configuration if required to execute \p hfunc. Any context-wide preference
  9330. + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
  9331. + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
  9332. + * that case, the current context-wide setting will be used.
  9333. + *
  9334. + * This setting does nothing on devices where the size of the L1 cache and
  9335. + * shared memory are fixed.
  9336. + *
  9337. + * Launching a kernel with a different preference than the most recent
  9338. + * preference setting may insert a device-side synchronization point.
  9339. + *
  9340. + *
  9341. + * The supported cache configurations are:
  9342. + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
  9343. + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
  9344. + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
  9345. + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
  9346. + *
  9347. + * \param hfunc - Kernel to configure cache for
  9348. + * \param config - Requested cache configuration
  9349. + *
  9350. + * \return
  9351. + * ::CUDA_SUCCESS,
  9352. + * ::CUDA_ERROR_INVALID_VALUE,
  9353. + * ::CUDA_ERROR_DEINITIALIZED,
  9354. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9355. + * ::CUDA_ERROR_INVALID_CONTEXT
  9356. + * \notefnerr
  9357. + *
  9358. + * \sa ::cuCtxGetCacheConfig,
  9359. + * ::cuCtxSetCacheConfig,
  9360. + * ::cuFuncGetAttribute,
  9361. + * ::cuLaunchKernel
  9362. + */
  9363. +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
  9364. +
  9365. +#if __CUDA_API_VERSION >= 4020
  9366. +/**
  9367. + * \brief Sets the shared memory configuration for a device function.
  9368. + *
  9369. + * On devices with configurable shared memory banks, this function will
  9370. + * force all subsequent launches of the specified device function to have
  9371. + * the given shared memory bank size configuration. On any given launch of the
  9372. + * function, the shared memory configuration of the device will be temporarily
  9373. + * changed if needed to suit the function's preferred configuration. Changes in
  9374. + * shared memory configuration between subsequent launches of functions
  9375. + * may introduce a device-side synchronization point.
  9376. + *
  9377. + * Any per-function setting of shared memory bank size set via
  9378. + * ::cuFuncSetSharedMemConfig will override the context wide setting set with
  9379. + * ::cuCtxSetSharedMemConfig.
  9380. + *
  9381. + * Changing the shared memory bank size will not increase shared memory usage
  9382. + * or affect occupancy of kernels, but may have major effects on performance.
  9383. + * Larger bank sizes will allow for greater potential bandwidth to shared memory,
  9384. + * but will change what kinds of accesses to shared memory will result in bank
  9385. + * conflicts.
  9386. + *
  9387. + * This function will do nothing on devices with fixed shared memory bank size.
  9388. + *
  9389. + * The supported bank configurations are:
  9390. + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
  9391. + * configuration when launching this function.
  9392. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
  9393. + * be natively four bytes when launching this function.
  9394. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
  9395. + * be natively eight bytes when launching this function.
  9396. + *
  9397. + * \param hfunc - kernel to be given a shared memory config
  9398. + * \param config - requested shared memory configuration
  9399. + *
  9400. + * \return
  9401. + * ::CUDA_SUCCESS,
  9402. + * ::CUDA_ERROR_INVALID_VALUE,
  9403. + * ::CUDA_ERROR_DEINITIALIZED,
  9404. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9405. + * ::CUDA_ERROR_INVALID_CONTEXT
  9406. + * \notefnerr
  9407. + *
  9408. + * \sa ::cuCtxGetCacheConfig,
  9409. + * ::cuCtxSetCacheConfig,
  9410. + * ::cuCtxGetSharedMemConfig,
  9411. + * ::cuCtxSetSharedMemConfig,
  9412. + * ::cuFuncGetAttribute,
  9413. + * ::cuLaunchKernel
  9414. + */
  9415. +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
  9416. +#endif
  9417. +
  9418. +#if __CUDA_API_VERSION >= 4000
  9419. +/**
  9420. + * \brief Launches a CUDA function
  9421. + *
  9422. + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
  9423. + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
  9424. + * \p blockDimZ threads.
  9425. + *
  9426. + * \p sharedMemBytes sets the amount of dynamic shared memory that will be
  9427. + * available to each thread block.
  9428. + *
  9429. + * ::cuLaunchKernel() can optionally be associated with a stream by passing a
  9430. + * non-zero \p hStream argument.
  9431. + *
  9432. + * Kernel parameters to \p f can be specified in one of two ways:
  9433. + *
  9434. + * 1) Kernel parameters can be specified via \p kernelParams. If \p f
  9435. + * has N parameters, then \p kernelParams needs to be an array of N
  9436. + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1]
  9437. + * must point to a region of memory from which the actual kernel
  9438. + * parameter will be copied. The number of kernel parameters and their
  9439. + * offsets and sizes do not need to be specified as that information is
  9440. + * retrieved directly from the kernel's image.
  9441. + *
  9442. + * 2) Kernel parameters can also be packaged by the application into
  9443. + * a single buffer that is passed in via the \p extra parameter.
  9444. + * This places the burden on the application of knowing each kernel
  9445. + * parameter's size and alignment/padding within the buffer. Here is
  9446. + * an example of using the \p extra parameter in this manner:
  9447. + * \code
  9448. + size_t argBufferSize;
  9449. + char argBuffer[256];
  9450. +
  9451. + // populate argBuffer and argBufferSize
  9452. +
  9453. + void *config[] = {
  9454. + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
  9455. + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
  9456. + CU_LAUNCH_PARAM_END
  9457. + };
  9458. + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
  9459. + * \endcode
  9460. + *
  9461. + * The \p extra parameter exists to allow ::cuLaunchKernel to take
  9462. + * additional less commonly used arguments. \p extra specifies a list of
  9463. + * names of extra settings and their corresponding values. Each extra
  9464. + * setting name is immediately followed by the corresponding value. The
  9465. + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
  9466. + *
  9467. + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
  9468. + * array;
  9469. + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
  9470. + * value in \p extra will be a pointer to a buffer containing all
  9471. + * the kernel parameters for launching kernel \p f;
  9472. + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
  9473. + * value in \p extra will be a pointer to a size_t containing the
  9474. + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
  9475. + *
  9476. + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
  9477. + * parameters are specified with both \p kernelParams and \p extra
  9478. + * (i.e. both \p kernelParams and \p extra are non-NULL).
  9479. + *
  9480. + * Calling ::cuLaunchKernel() sets persistent function state that is
  9481. + * the same as function state set through the following deprecated APIs:
  9482. + *
  9483. + * ::cuFuncSetBlockShape()
  9484. + * ::cuFuncSetSharedSize()
  9485. + * ::cuParamSetSize()
  9486. + * ::cuParamSeti()
  9487. + * ::cuParamSetf()
  9488. + * ::cuParamSetv()
  9489. + *
  9490. + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous
  9491. + * block shape, shared size and parameter info associated with \p f
  9492. + * is overwritten.
  9493. + *
  9494. + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
  9495. + * been compiled with toolchain version 3.2 or later so that it will
  9496. + * contain kernel parameter information, or have no kernel parameters.
  9497. + * If either of these conditions is not met, then ::cuLaunchKernel() will
  9498. + * return ::CUDA_ERROR_INVALID_IMAGE.
  9499. + *
  9500. + * \param f - Kernel to launch
  9501. + * \param gridDimX - Width of grid in blocks
  9502. + * \param gridDimY - Height of grid in blocks
  9503. + * \param gridDimZ - Depth of grid in blocks
  9504. + * \param blockDimX - X dimension of each thread block
  9505. + * \param blockDimY - Y dimension of each thread block
  9506. + * \param blockDimZ - Z dimension of each thread block
  9507. + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
  9508. + * \param hStream - Stream identifier
  9509. + * \param kernelParams - Array of pointers to kernel parameters
  9510. + * \param extra - Extra options
  9511. + *
  9512. + * \return
  9513. + * ::CUDA_SUCCESS,
  9514. + * ::CUDA_ERROR_DEINITIALIZED,
  9515. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9516. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9517. + * ::CUDA_ERROR_INVALID_HANDLE,
  9518. + * ::CUDA_ERROR_INVALID_IMAGE,
  9519. + * ::CUDA_ERROR_INVALID_VALUE,
  9520. + * ::CUDA_ERROR_LAUNCH_FAILED,
  9521. + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
  9522. + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
  9523. + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
  9524. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  9525. + * \notefnerr
  9526. + *
  9527. + * \sa ::cuCtxGetCacheConfig,
  9528. + * ::cuCtxSetCacheConfig,
  9529. + * ::cuFuncSetCacheConfig,
  9530. + * ::cuFuncGetAttribute
  9531. + */
  9532. +CUresult CUDAAPI cuLaunchKernel(CUfunction f,
  9533. + unsigned int gridDimX,
  9534. + unsigned int gridDimY,
  9535. + unsigned int gridDimZ,
  9536. + unsigned int blockDimX,
  9537. + unsigned int blockDimY,
  9538. + unsigned int blockDimZ,
  9539. + unsigned int sharedMemBytes,
  9540. + CUstream hStream,
  9541. + void **kernelParams,
  9542. + void **extra);
  9543. +#endif /* __CUDA_API_VERSION >= 4000 */
  9544. +
  9545. +/**
  9546. + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
  9547. + *
  9548. + * This section describes the deprecated execution control functions of the
  9549. + * low-level CUDA driver application programming interface.
  9550. + *
  9551. + * @{
  9552. + */
  9553. +
  9554. +/**
  9555. + * \brief Sets the block-dimensions for the function
  9556. + *
  9557. + * \deprecated
  9558. + *
  9559. + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
  9560. + * created when the kernel given by \p hfunc is launched.
  9561. + *
  9562. + * \param hfunc - Kernel to specify dimensions of
  9563. + * \param x - X dimension
  9564. + * \param y - Y dimension
  9565. + * \param z - Z dimension
  9566. + *
  9567. + * \return
  9568. + * ::CUDA_SUCCESS,
  9569. + * ::CUDA_ERROR_DEINITIALIZED,
  9570. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9571. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9572. + * ::CUDA_ERROR_INVALID_HANDLE,
  9573. + * ::CUDA_ERROR_INVALID_VALUE
  9574. + * \notefnerr
  9575. + *
  9576. + * \sa ::cuFuncSetSharedSize,
  9577. + * ::cuFuncSetCacheConfig,
  9578. + * ::cuFuncGetAttribute,
  9579. + * ::cuParamSetSize,
  9580. + * ::cuParamSeti,
  9581. + * ::cuParamSetf,
  9582. + * ::cuParamSetv,
  9583. + * ::cuLaunch,
  9584. + * ::cuLaunchGrid,
  9585. + * ::cuLaunchGridAsync,
  9586. + * ::cuLaunchKernel
  9587. + */
  9588. +CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
  9589. +
  9590. +/**
  9591. + * \brief Sets the dynamic shared-memory size for the function
  9592. + *
  9593. + * \deprecated
  9594. + *
  9595. + * Sets through \p bytes the amount of dynamic shared memory that will be
  9596. + * available to each thread block when the kernel given by \p hfunc is launched.
  9597. + *
  9598. + * \param hfunc - Kernel to specify dynamic shared-memory size for
  9599. + * \param bytes - Dynamic shared-memory size per thread block in bytes
  9600. + *
  9601. + * \return
  9602. + * ::CUDA_SUCCESS,
  9603. + * ::CUDA_ERROR_DEINITIALIZED,
  9604. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9605. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9606. + * ::CUDA_ERROR_INVALID_HANDLE,
  9607. + * ::CUDA_ERROR_INVALID_VALUE
  9608. + * \notefnerr
  9609. + *
  9610. + * \sa ::cuFuncSetBlockShape,
  9611. + * ::cuFuncSetCacheConfig,
  9612. + * ::cuFuncGetAttribute,
  9613. + * ::cuParamSetSize,
  9614. + * ::cuParamSeti,
  9615. + * ::cuParamSetf,
  9616. + * ::cuParamSetv,
  9617. + * ::cuLaunch,
  9618. + * ::cuLaunchGrid,
  9619. + * ::cuLaunchGridAsync,
  9620. + * ::cuLaunchKernel
  9621. + */
  9622. +CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
  9623. +
  9624. +/**
  9625. + * \brief Sets the parameter size for the function
  9626. + *
  9627. + * \deprecated
  9628. + *
  9629. + * Sets through \p numbytes the total size in bytes needed by the function
  9630. + * parameters of the kernel corresponding to \p hfunc.
  9631. + *
  9632. + * \param hfunc - Kernel to set parameter size for
  9633. + * \param numbytes - Size of parameter list in bytes
  9634. + *
  9635. + * \return
  9636. + * ::CUDA_SUCCESS,
  9637. + * ::CUDA_ERROR_DEINITIALIZED,
  9638. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9639. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9640. + * ::CUDA_ERROR_INVALID_VALUE
  9641. + * \notefnerr
  9642. + *
  9643. + * \sa ::cuFuncSetBlockShape,
  9644. + * ::cuFuncSetSharedSize,
  9645. + * ::cuFuncGetAttribute,
  9646. + * ::cuParamSetf,
  9647. + * ::cuParamSeti,
  9648. + * ::cuParamSetv,
  9649. + * ::cuLaunch,
  9650. + * ::cuLaunchGrid,
  9651. + * ::cuLaunchGridAsync,
  9652. + * ::cuLaunchKernel
  9653. + */
  9654. +CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
  9655. +
  9656. +/**
  9657. + * \brief Adds an integer parameter to the function's argument list
  9658. + *
  9659. + * \deprecated
  9660. + *
  9661. + * Sets an integer parameter that will be specified the next time the
  9662. + * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
  9663. + *
  9664. + * \param hfunc - Kernel to add parameter to
  9665. + * \param offset - Offset to add parameter to argument list
  9666. + * \param value - Value of parameter
  9667. + *
  9668. + * \return
  9669. + * ::CUDA_SUCCESS,
  9670. + * ::CUDA_ERROR_DEINITIALIZED,
  9671. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9672. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9673. + * ::CUDA_ERROR_INVALID_VALUE
  9674. + * \notefnerr
  9675. + *
  9676. + * \sa ::cuFuncSetBlockShape,
  9677. + * ::cuFuncSetSharedSize,
  9678. + * ::cuFuncGetAttribute,
  9679. + * ::cuParamSetSize,
  9680. + * ::cuParamSetf,
  9681. + * ::cuParamSetv,
  9682. + * ::cuLaunch,
  9683. + * ::cuLaunchGrid,
  9684. + * ::cuLaunchGridAsync,
  9685. + * ::cuLaunchKernel
  9686. + */
  9687. +CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
  9688. +
  9689. +/**
  9690. + * \brief Adds a floating-point parameter to the function's argument list
  9691. + *
  9692. + * \deprecated
  9693. + *
  9694. + * Sets a floating-point parameter that will be specified the next time the
  9695. + * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
  9696. + *
  9697. + * \param hfunc - Kernel to add parameter to
  9698. + * \param offset - Offset to add parameter to argument list
  9699. + * \param value - Value of parameter
  9700. + *
  9701. + * \return
  9702. + * ::CUDA_SUCCESS,
  9703. + * ::CUDA_ERROR_DEINITIALIZED,
  9704. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9705. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9706. + * ::CUDA_ERROR_INVALID_VALUE
  9707. + * \notefnerr
  9708. + *
  9709. + * \sa ::cuFuncSetBlockShape,
  9710. + * ::cuFuncSetSharedSize,
  9711. + * ::cuFuncGetAttribute,
  9712. + * ::cuParamSetSize,
  9713. + * ::cuParamSeti,
  9714. + * ::cuParamSetv,
  9715. + * ::cuLaunch,
  9716. + * ::cuLaunchGrid,
  9717. + * ::cuLaunchGridAsync,
  9718. + * ::cuLaunchKernel
  9719. + */
  9720. +CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
  9721. +
  9722. +/**
  9723. + * \brief Adds arbitrary data to the function's argument list
  9724. + *
  9725. + * \deprecated
  9726. + *
  9727. + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
  9728. + * into the parameter space of the kernel corresponding to \p hfunc. \p offset
  9729. + * is a byte offset.
  9730. + *
  9731. + * \param hfunc - Kernel to add data to
  9732. + * \param offset - Offset to add data to argument list
  9733. + * \param ptr - Pointer to arbitrary data
  9734. + * \param numbytes - Size of data to copy in bytes
  9735. + *
  9736. + * \return
  9737. + * ::CUDA_SUCCESS,
  9738. + * ::CUDA_ERROR_DEINITIALIZED,
  9739. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9740. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9741. + * ::CUDA_ERROR_INVALID_VALUE
  9742. + * \notefnerr
  9743. + *
  9744. + * \sa ::cuFuncSetBlockShape,
  9745. + * ::cuFuncSetSharedSize,
  9746. + * ::cuFuncGetAttribute,
  9747. + * ::cuParamSetSize,
  9748. + * ::cuParamSetf,
  9749. + * ::cuParamSeti,
  9750. + * ::cuLaunch,
  9751. + * ::cuLaunchGrid,
  9752. + * ::cuLaunchGridAsync,
  9753. + * ::cuLaunchKernel
  9754. + */
  9755. +CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
  9756. +
  9757. +/**
  9758. + * \brief Launches a CUDA function
  9759. + *
  9760. + * \deprecated
  9761. + *
  9762. + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
  9763. + * contains the number of threads specified by a previous call to
  9764. + * ::cuFuncSetBlockShape().
  9765. + *
  9766. + * \param f - Kernel to launch
  9767. + *
  9768. + * \return
  9769. + * ::CUDA_SUCCESS,
  9770. + * ::CUDA_ERROR_DEINITIALIZED,
  9771. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9772. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9773. + * ::CUDA_ERROR_INVALID_VALUE,
  9774. + * ::CUDA_ERROR_LAUNCH_FAILED,
  9775. + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
  9776. + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
  9777. + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
  9778. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  9779. + * \notefnerr
  9780. + *
  9781. + * \sa ::cuFuncSetBlockShape,
  9782. + * ::cuFuncSetSharedSize,
  9783. + * ::cuFuncGetAttribute,
  9784. + * ::cuParamSetSize,
  9785. + * ::cuParamSetf,
  9786. + * ::cuParamSeti,
  9787. + * ::cuParamSetv,
  9788. + * ::cuLaunchGrid,
  9789. + * ::cuLaunchGridAsync,
  9790. + * ::cuLaunchKernel
  9791. + */
  9792. +CUresult CUDAAPI cuLaunch(CUfunction f);
  9793. +
  9794. +/**
  9795. + * \brief Launches a CUDA function
  9796. + *
  9797. + * \deprecated
  9798. + *
  9799. + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
  9800. + * blocks. Each block contains the number of threads specified by a previous
  9801. + * call to ::cuFuncSetBlockShape().
  9802. + *
  9803. + * \param f - Kernel to launch
  9804. + * \param grid_width - Width of grid in blocks
  9805. + * \param grid_height - Height of grid in blocks
  9806. + *
  9807. + * \return
  9808. + * ::CUDA_SUCCESS,
  9809. + * ::CUDA_ERROR_DEINITIALIZED,
  9810. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9811. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9812. + * ::CUDA_ERROR_INVALID_VALUE,
  9813. + * ::CUDA_ERROR_LAUNCH_FAILED,
  9814. + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
  9815. + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
  9816. + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
  9817. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  9818. + * \notefnerr
  9819. + *
  9820. + * \sa ::cuFuncSetBlockShape,
  9821. + * ::cuFuncSetSharedSize,
  9822. + * ::cuFuncGetAttribute,
  9823. + * ::cuParamSetSize,
  9824. + * ::cuParamSetf,
  9825. + * ::cuParamSeti,
  9826. + * ::cuParamSetv,
  9827. + * ::cuLaunch,
  9828. + * ::cuLaunchGridAsync,
  9829. + * ::cuLaunchKernel
  9830. + */
  9831. +CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
  9832. +
  9833. +/**
  9834. + * \brief Launches a CUDA function
  9835. + *
  9836. + * \deprecated
  9837. + *
  9838. + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
  9839. + * blocks. Each block contains the number of threads specified by a previous
  9840. + * call to ::cuFuncSetBlockShape().
  9841. + *
  9842. + * ::cuLaunchGridAsync() can optionally be associated with a stream by passing a
  9843. + * non-zero \p hStream argument.
  9844. + *
  9845. + * \param f - Kernel to launch
  9846. + * \param grid_width - Width of grid in blocks
  9847. + * \param grid_height - Height of grid in blocks
  9848. + * \param hStream - Stream identifier
  9849. + *
  9850. + * \return
  9851. + * ::CUDA_SUCCESS,
  9852. + * ::CUDA_ERROR_DEINITIALIZED,
  9853. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9854. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9855. + * ::CUDA_ERROR_INVALID_HANDLE,
  9856. + * ::CUDA_ERROR_INVALID_VALUE,
  9857. + * ::CUDA_ERROR_LAUNCH_FAILED,
  9858. + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
  9859. + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
  9860. + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
  9861. + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  9862. + * \notefnerr
  9863. + *
  9864. + * \sa ::cuFuncSetBlockShape,
  9865. + * ::cuFuncSetSharedSize,
  9866. + * ::cuFuncGetAttribute,
  9867. + * ::cuParamSetSize,
  9868. + * ::cuParamSetf,
  9869. + * ::cuParamSeti,
  9870. + * ::cuParamSetv,
  9871. + * ::cuLaunch,
  9872. + * ::cuLaunchGrid,
  9873. + * ::cuLaunchKernel
  9874. + */
  9875. +CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
  9876. +
  9877. +
  9878. +/**
  9879. + * \brief Adds a texture-reference to the function's argument list
  9880. + *
  9881. + * \deprecated
  9882. + *
  9883. + * Makes the CUDA array or linear memory bound to the texture reference
  9884. + * \p hTexRef available to a device program as a texture. In this version of
  9885. + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
  9886. + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
  9887. + *
  9888. + * \param hfunc - Kernel to add texture-reference to
  9889. + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
  9890. + * \param hTexRef - Texture-reference to add to argument list
  9891. + *
  9892. + * \return
  9893. + * ::CUDA_SUCCESS,
  9894. + * ::CUDA_ERROR_DEINITIALIZED,
  9895. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9896. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9897. + * ::CUDA_ERROR_INVALID_VALUE
  9898. + * \notefnerr
  9899. + */
  9900. +CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
  9901. +/** @} */ /* END CUDA_EXEC_DEPRECATED */
  9902. +
  9903. +/** @} */ /* END CUDA_EXEC */
  9904. +
  9905. +
  9906. +/**
  9907. + * \defgroup CUDA_TEXREF Texture Reference Management
  9908. + *
  9909. + * This section describes the texture reference management functions of the
  9910. + * low-level CUDA driver application programming interface.
  9911. + *
  9912. + * @{
  9913. + */
  9914. +
  9915. +/**
  9916. + * \brief Binds an array as a texture reference
  9917. + *
  9918. + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
  9919. + * previous address or CUDA array state associated with the texture reference
  9920. + * is superseded by this function. \p Flags must be set to
  9921. + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
  9922. + * unbound.
  9923. + *
  9924. + * \param hTexRef - Texture reference to bind
  9925. + * \param hArray - Array to bind
  9926. + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
  9927. + *
  9928. + * \return
  9929. + * ::CUDA_SUCCESS,
  9930. + * ::CUDA_ERROR_DEINITIALIZED,
  9931. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9932. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9933. + * ::CUDA_ERROR_INVALID_VALUE
  9934. + *
  9935. + * \sa ::cuTexRefSetAddress,
  9936. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
  9937. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  9938. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  9939. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  9940. + */
  9941. +CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
  9942. +
  9943. +#if __CUDA_API_VERSION >= 3020
  9944. +/**
  9945. + * \brief Binds an address as a texture reference
  9946. + *
  9947. + * Binds a linear address range to the texture reference \p hTexRef. Any
  9948. + * previous address or CUDA array state associated with the texture reference
  9949. + * is superseded by this function. Any memory previously bound to \p hTexRef
  9950. + * is unbound.
  9951. + *
  9952. + * Since the hardware enforces an alignment requirement on texture base
  9953. + * addresses, ::cuTexRefSetAddress() passes back a byte offset in
  9954. + * \p *ByteOffset that must be applied to texture fetches in order to read from
  9955. + * the desired memory. This offset must be divided by the texel size and
  9956. + * passed to kernels that read from the texture so it can be applied to the
  9957. + * ::tex1Dfetch() function.
  9958. + *
  9959. + * If the device memory pointer was returned from ::cuMemAlloc(), the offset
  9960. + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
  9961. + *
  9962. + * The total number of elements (or texels) in the linear address range
  9963. + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
  9964. + * The number of elements is computed as (\p bytes / bytesPerElement),
  9965. + * where bytesPerElement is determined from the data format and number of
  9966. + * components set using ::cuTexRefSetFormat().
  9967. + *
  9968. + * \param ByteOffset - Returned byte offset
  9969. + * \param hTexRef - Texture reference to bind
  9970. + * \param dptr - Device pointer to bind
  9971. + * \param bytes - Size of memory to bind in bytes
  9972. + *
  9973. + * \return
  9974. + * ::CUDA_SUCCESS,
  9975. + * ::CUDA_ERROR_DEINITIALIZED,
  9976. + * ::CUDA_ERROR_NOT_INITIALIZED,
  9977. + * ::CUDA_ERROR_INVALID_CONTEXT,
  9978. + * ::CUDA_ERROR_INVALID_VALUE
  9979. + *
  9980. + * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  9981. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  9982. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  9983. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  9984. + */
  9985. +CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
  9986. +
  9987. +/**
  9988. + * \brief Binds an address as a 2D texture reference
  9989. + *
  9990. + * Binds a linear address range to the texture reference \p hTexRef. Any
  9991. + * previous address or CUDA array state associated with the texture reference
  9992. + * is superseded by this function. Any memory previously bound to \p hTexRef
  9993. + * is unbound.
  9994. + *
  9995. + * Using a ::tex2D() function inside a kernel requires a call to either
  9996. + * ::cuTexRefSetArray() to bind the corresponding texture reference to an
  9997. + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
  9998. + * memory.
  9999. + *
  10000. + * Function calls to ::cuTexRefSetFormat() cannot follow calls to
  10001. + * ::cuTexRefSetAddress2D() for the same texture reference.
  10002. + *
  10003. + * It is required that \p dptr be aligned to the appropriate hardware-specific
  10004. + * texture alignment. You can query this value using the device attribute
  10005. + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
  10006. + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
  10007. + *
  10008. + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
  10009. + * This value can be queried using the device attribute
  10010. + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
  10011. + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
  10012. + *
  10013. + * Width and Height, which are specified in elements (or texels), cannot exceed
  10014. + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
  10015. + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
  10016. + * \p Pitch, which is specified in bytes, cannot exceed
  10017. + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
  10018. + *
  10019. + * \param hTexRef - Texture reference to bind
  10020. + * \param desc - Descriptor of CUDA array
  10021. + * \param dptr - Device pointer to bind
  10022. + * \param Pitch - Line pitch in bytes
  10023. + *
  10024. + * \return
  10025. + * ::CUDA_SUCCESS,
  10026. + * ::CUDA_ERROR_DEINITIALIZED,
  10027. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10028. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10029. + * ::CUDA_ERROR_INVALID_VALUE
  10030. + *
  10031. + * \sa ::cuTexRefSetAddress,
  10032. + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10033. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10034. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10035. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10036. + */
  10037. +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
  10038. +#endif /* __CUDA_API_VERSION >= 3020 */
  10039. +
  10040. +/**
  10041. + * \brief Sets the format for a texture reference
  10042. + *
  10043. + * Specifies the format of the data to be read by the texture reference
  10044. + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
  10045. + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
  10046. + * They specify the format of each component and the number of components per
  10047. + * array element.
  10048. + *
  10049. + * \param hTexRef - Texture reference
  10050. + * \param fmt - Format to set
  10051. + * \param NumPackedComponents - Number of components per array element
  10052. + *
  10053. + * \return
  10054. + * ::CUDA_SUCCESS,
  10055. + * ::CUDA_ERROR_DEINITIALIZED,
  10056. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10057. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10058. + * ::CUDA_ERROR_INVALID_VALUE
  10059. + *
  10060. + * \sa ::cuTexRefSetAddress,
  10061. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10062. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
  10063. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10064. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10065. + */
  10066. +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
  10067. +
  10068. +/**
  10069. + * \brief Sets the addressing mode for a texture reference
  10070. + *
  10071. + * Specifies the addressing mode \p am for the given dimension \p dim of the
  10072. + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
  10073. + * applied to the first parameter of the functions used to fetch from the
  10074. + * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
  10075. + * as:
  10076. + * \code
  10077. + typedef enum CUaddress_mode_enum {
  10078. + CU_TR_ADDRESS_MODE_WRAP = 0,
  10079. + CU_TR_ADDRESS_MODE_CLAMP = 1,
  10080. + CU_TR_ADDRESS_MODE_MIRROR = 2,
  10081. + CU_TR_ADDRESS_MODE_BORDER = 3
  10082. + } CUaddress_mode;
  10083. + * \endcode
  10084. + *
  10085. + * Note that this call has no effect if \p hTexRef is bound to linear memory.
  10086. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
  10087. + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
  10088. + *
  10089. + * \param hTexRef - Texture reference
  10090. + * \param dim - Dimension
  10091. + * \param am - Addressing mode to set
  10092. + *
  10093. + * \return
  10094. + * ::CUDA_SUCCESS,
  10095. + * ::CUDA_ERROR_DEINITIALIZED,
  10096. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10097. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10098. + * ::CUDA_ERROR_INVALID_VALUE
  10099. + *
  10100. + * \sa ::cuTexRefSetAddress,
  10101. + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
  10102. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10103. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10104. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10105. + */
  10106. +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
  10107. +
  10108. +/**
  10109. + * \brief Sets the filtering mode for a texture reference
  10110. + *
  10111. + * Specifies the filtering mode \p fm to be used when reading memory through
  10112. + * the texture reference \p hTexRef. ::CUfilter_mode is defined as:
  10113. + *
  10114. + * \code
  10115. + typedef enum CUfilter_mode_enum {
  10116. + CU_TR_FILTER_MODE_POINT = 0,
  10117. + CU_TR_FILTER_MODE_LINEAR = 1
  10118. + } CUfilter_mode;
  10119. + * \endcode
  10120. + *
  10121. + * Note that this call has no effect if \p hTexRef is bound to linear memory.
  10122. + *
  10123. + * \param hTexRef - Texture reference
  10124. + * \param fm - Filtering mode to set
  10125. + *
  10126. + * \return
  10127. + * ::CUDA_SUCCESS,
  10128. + * ::CUDA_ERROR_DEINITIALIZED,
  10129. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10130. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10131. + * ::CUDA_ERROR_INVALID_VALUE
  10132. + *
  10133. + * \sa ::cuTexRefSetAddress,
  10134. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10135. + * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10136. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10137. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10138. + */
  10139. +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
  10140. +
  10141. +/**
  10142. + * \brief Sets the flags for a texture reference
  10143. + *
  10144. + * Specifies optional flags via \p Flags to specify the behavior of data
  10145. + * returned through the texture reference \p hTexRef. The valid flags are:
  10146. + *
  10147. + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
  10148. + * having the texture promote integer data to floating point data in the
  10149. + * range [0, 1]. Note that textures with 32-bit integer format
  10150. + * would not be promoted, regardless of whether or not this
  10151. + * flag is specified;
  10152. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
  10153. + * default behavior of having the texture coordinates range
  10154. + * from [0, Dim) where Dim is the width or height of the CUDA
  10155. + * array. Instead, the texture coordinates [0, 1.0) reference
  10156. + * the entire breadth of the array dimension;
  10157. + *
  10158. + * \param hTexRef - Texture reference
  10159. + * \param Flags - Optional flags to set
  10160. + *
  10161. + * \return
  10162. + * ::CUDA_SUCCESS,
  10163. + * ::CUDA_ERROR_DEINITIALIZED,
  10164. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10165. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10166. + * ::CUDA_ERROR_INVALID_VALUE
  10167. + *
  10168. + * \sa ::cuTexRefSetAddress,
  10169. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10170. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
  10171. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10172. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10173. + */
  10174. +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
  10175. +
  10176. +#if __CUDA_API_VERSION >= 3020
  10177. +/**
  10178. + * \brief Gets the address associated with a texture reference
  10179. + *
  10180. + * Returns in \p *pdptr the base address bound to the texture reference
  10181. + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
  10182. + * is not bound to any device memory range.
  10183. + *
  10184. + * \param pdptr - Returned device address
  10185. + * \param hTexRef - Texture reference
  10186. + *
  10187. + * \return
  10188. + * ::CUDA_SUCCESS,
  10189. + * ::CUDA_ERROR_DEINITIALIZED,
  10190. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10191. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10192. + * ::CUDA_ERROR_INVALID_VALUE
  10193. + *
  10194. + * \sa ::cuTexRefSetAddress,
  10195. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10196. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10197. + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10198. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10199. + */
  10200. +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
  10201. +#endif /* __CUDA_API_VERSION >= 3020 */
  10202. +
  10203. +/**
  10204. + * \brief Gets the array bound to a texture reference
  10205. + *
  10206. + * Returns in \p *phArray the CUDA array bound to the texture reference
  10207. + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
  10208. + * is not bound to any CUDA array.
  10209. + *
  10210. + * \param phArray - Returned array
  10211. + * \param hTexRef - Texture reference
  10212. + *
  10213. + * \return
  10214. + * ::CUDA_SUCCESS,
  10215. + * ::CUDA_ERROR_DEINITIALIZED,
  10216. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10217. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10218. + * ::CUDA_ERROR_INVALID_VALUE
  10219. + *
  10220. + * \sa ::cuTexRefSetAddress,
  10221. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10222. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10223. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
  10224. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10225. + */
  10226. +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
  10227. +
  10228. +/**
  10229. + * \brief Gets the addressing mode used by a texture reference
  10230. + *
  10231. + * Returns in \p *pam the addressing mode corresponding to the
  10232. + * dimension \p dim of the texture reference \p hTexRef. Currently, the only
  10233. + * valid values for \p dim are 0 and 1.
  10234. + *
  10235. + * \param pam - Returned addressing mode
  10236. + * \param hTexRef - Texture reference
  10237. + * \param dim - Dimension
  10238. + *
  10239. + * \return
  10240. + * ::CUDA_SUCCESS,
  10241. + * ::CUDA_ERROR_DEINITIALIZED,
  10242. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10243. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10244. + * ::CUDA_ERROR_INVALID_VALUE
  10245. + *
  10246. + * \sa ::cuTexRefSetAddress,
  10247. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10248. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10249. + * ::cuTexRefGetAddress, ::cuTexRefGetArray,
  10250. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10251. + */
  10252. +CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
  10253. +
  10254. +/**
  10255. + * \brief Gets the filter-mode used by a texture reference
  10256. + *
  10257. + * Returns in \p *pfm the filtering mode of the texture reference
  10258. + * \p hTexRef.
  10259. + *
  10260. + * \param pfm - Returned filtering mode
  10261. + * \param hTexRef - Texture reference
  10262. + *
  10263. + * \return
  10264. + * ::CUDA_SUCCESS,
  10265. + * ::CUDA_ERROR_DEINITIALIZED,
  10266. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10267. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10268. + * ::CUDA_ERROR_INVALID_VALUE
  10269. + *
  10270. + * \sa ::cuTexRefSetAddress,
  10271. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10272. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10273. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10274. + * ::cuTexRefGetFlags, ::cuTexRefGetFormat
  10275. + */
  10276. +CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
  10277. +
  10278. +/**
  10279. + * \brief Gets the format used by a texture reference
  10280. + *
  10281. + * Returns in \p *pFormat and \p *pNumChannels the format and number
  10282. + * of components of the CUDA array bound to the texture reference \p hTexRef.
  10283. + * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
  10284. + *
  10285. + * \param pFormat - Returned format
  10286. + * \param pNumChannels - Returned number of components
  10287. + * \param hTexRef - Texture reference
  10288. + *
  10289. + * \return
  10290. + * ::CUDA_SUCCESS,
  10291. + * ::CUDA_ERROR_DEINITIALIZED,
  10292. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10293. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10294. + * ::CUDA_ERROR_INVALID_VALUE
  10295. + *
  10296. + * \sa ::cuTexRefSetAddress,
  10297. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10298. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10299. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10300. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
  10301. + */
  10302. +CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
  10303. +
  10304. +/**
  10305. + * \brief Gets the flags used by a texture reference
  10306. + *
  10307. + * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
  10308. + *
  10309. + * \param pFlags - Returned flags
  10310. + * \param hTexRef - Texture reference
  10311. + *
  10312. + * \return
  10313. + * ::CUDA_SUCCESS,
  10314. + * ::CUDA_ERROR_DEINITIALIZED,
  10315. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10316. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10317. + * ::CUDA_ERROR_INVALID_VALUE
  10318. + *
  10319. + * \sa ::cuTexRefSetAddress,
  10320. + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
  10321. + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
  10322. + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  10323. + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
  10324. + */
  10325. +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
  10326. +
  10327. +/**
  10328. + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
  10329. + *
  10330. + * This section describes the deprecated texture reference management
  10331. + * functions of the low-level CUDA driver application programming interface.
  10332. + *
  10333. + * @{
  10334. + */
  10335. +
  10336. +/**
  10337. + * \brief Creates a texture reference
  10338. + *
  10339. + * \deprecated
  10340. + *
  10341. + * Creates a texture reference and returns its handle in \p *pTexRef. Once
  10342. + * created, the application must call ::cuTexRefSetArray() or
  10343. + * ::cuTexRefSetAddress() to associate the reference with allocated memory.
  10344. + * Other texture reference functions are used to specify the format and
  10345. + * interpretation (addressing, filtering, etc.) to be used when the memory is
  10346. + * read through this texture reference.
  10347. + *
  10348. + * \param pTexRef - Returned texture reference
  10349. + *
  10350. + * \return
  10351. + * ::CUDA_SUCCESS,
  10352. + * ::CUDA_ERROR_DEINITIALIZED,
  10353. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10354. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10355. + * ::CUDA_ERROR_INVALID_VALUE
  10356. + *
  10357. + * \sa ::cuTexRefDestroy
  10358. + */
  10359. +CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
  10360. +
  10361. +/**
  10362. + * \brief Destroys a texture reference
  10363. + *
  10364. + * \deprecated
  10365. + *
  10366. + * Destroys the texture reference specified by \p hTexRef.
  10367. + *
  10368. + * \param hTexRef - Texture reference to destroy
  10369. + *
  10370. + * \return
  10371. + * ::CUDA_SUCCESS,
  10372. + * ::CUDA_ERROR_DEINITIALIZED,
  10373. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10374. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10375. + * ::CUDA_ERROR_INVALID_VALUE
  10376. + *
  10377. + * \sa ::cuTexRefCreate
  10378. + */
  10379. +CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
  10380. +
  10381. +/** @} */ /* END CUDA_TEXREF_DEPRECATED */
  10382. +
  10383. +/** @} */ /* END CUDA_TEXREF */
  10384. +
  10385. +
  10386. +/**
  10387. + * \defgroup CUDA_SURFREF Surface Reference Management
  10388. + *
  10389. + * This section describes the surface reference management functions of the
  10390. + * low-level CUDA driver application programming interface.
  10391. + *
  10392. + * @{
  10393. + */
  10394. +
  10395. +/**
  10396. + * \brief Sets the CUDA array for a surface reference.
  10397. + *
  10398. + * Sets the CUDA array \p hArray to be read and written by the surface reference
  10399. + * \p hSurfRef. Any previous CUDA array state associated with the surface
  10400. + * reference is superseded by this function. \p Flags must be set to 0.
  10401. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
  10402. + * Any CUDA array previously bound to \p hSurfRef is unbound.
  10403. + *
  10404. + * \param hSurfRef - Surface reference handle
  10405. + * \param hArray - CUDA array handle
  10406. + * \param Flags - set to 0
  10407. + *
  10408. + * \return
  10409. + * ::CUDA_SUCCESS,
  10410. + * ::CUDA_ERROR_DEINITIALIZED,
  10411. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10412. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10413. + * ::CUDA_ERROR_INVALID_VALUE
  10414. + *
  10415. + * \sa ::cuModuleGetSurfRef, ::cuSurfRefGetArray
  10416. + */
  10417. +CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
  10418. +
  10419. +/**
  10420. + * \brief Passes back the CUDA array bound to a surface reference.
  10421. + *
  10422. + * Returns in \p *phArray the CUDA array bound to the surface reference
  10423. + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
  10424. + * is not bound to any CUDA array.
  10425. + *
  10426. + * \param phArray - Returned CUDA array bound to the surface reference
  10427. + * \param hSurfRef - Surface reference handle
  10428. + *
  10429. + * \return
  10430. + * ::CUDA_SUCCESS,
  10431. + * ::CUDA_ERROR_DEINITIALIZED,
  10432. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10433. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10434. + * ::CUDA_ERROR_INVALID_VALUE
  10435. + *
  10436. + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
  10437. + */
  10438. +CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
  10439. +
  10440. +/** @} */ /* END CUDA_SURFREF */
  10441. +
  10442. +#if __CUDA_API_VERSION >= 4000
  10443. +/**
  10444. + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
  10445. + *
  10446. + * This section describes the direct peer context memory access functions
  10447. + * of the low-level CUDA driver application programming interface.
  10448. + *
  10449. + * @{
  10450. + */
  10451. +
  10452. +/**
  10453. + * \brief Queries if a device may directly access a peer device's memory.
  10454. + *
  10455. + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
  10456. + * directly accessing memory from contexts on \p peerDev and 0 otherwise.
  10457. + * If direct access of \p peerDev from \p dev is possible, then access may be
  10458. + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
  10459. + *
  10460. + * \param canAccessPeer - Returned access capability
  10461. + * \param dev - Device from which allocations on \p peerDev are to
  10462. + * be directly accessed.
  10463. + * \param peerDev - Device on which the allocations to be directly accessed
  10464. + * by \p dev reside.
  10465. + *
  10466. + * \return
  10467. + * ::CUDA_SUCCESS,
  10468. + * ::CUDA_ERROR_DEINITIALIZED,
  10469. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10470. + * ::CUDA_ERROR_INVALID_DEVICE
  10471. + * \notefnerr
  10472. + *
  10473. + * \sa ::cuCtxEnablePeerAccess,
  10474. + * ::cuCtxDisablePeerAccess
  10475. + */
  10476. +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
  10477. +
  10478. +/**
  10479. + * \brief Enables direct access to memory allocations in a peer context.
  10480. + *
  10481. + * If both the current context and \p peerContext are on devices which support unified
  10482. + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING), then
  10483. + * on success all allocations from \p peerContext will immediately be accessible
  10484. + * by the current context. See \ref CUDA_UNIFIED for additional
  10485. + * details.
  10486. + *
  10487. + * Note that access granted by this call is unidirectional and that in order to access
  10488. + * memory from the current context in \p peerContext, a separate symmetric call
  10489. + * to ::cuCtxEnablePeerAccess() is required.
  10490. + *
  10491. + * Returns ::CUDA_ERROR_INVALID_DEVICE if ::cuDeviceCanAccessPeer() indicates
  10492. + * that the ::CUdevice of the current context cannot directly access memory
  10493. + * from the ::CUdevice of \p peerContext.
  10494. + *
  10495. + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
  10496. + * \p peerContext from the current context has already been enabled.
  10497. + *
  10498. + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
  10499. + * because hardware resources required for peer access have been exhausted.
  10500. + *
  10501. + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
  10502. + * is not a valid context, or if the current context is \p peerContext.
  10503. + *
  10504. + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
  10505. + *
  10506. + * \param peerContext - Peer context to enable direct access to from the current context
  10507. + * \param Flags - Reserved for future use and must be set to 0
  10508. + *
  10509. + * \return
  10510. + * ::CUDA_SUCCESS,
  10511. + * ::CUDA_ERROR_DEINITIALIZED,
  10512. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10513. + * ::CUDA_ERROR_INVALID_DEVICE,
  10514. + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
  10515. + * ::CUDA_ERROR_TOO_MANY_PEERS,
  10516. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10517. + * ::CUDA_ERROR_INVALID_VALUE
  10518. + * \notefnerr
  10519. + *
  10520. + * \sa ::cuDeviceCanAccessPeer,
  10521. + * ::cuCtxDisablePeerAccess
  10522. + */
  10523. +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
  10524. +
  10525. +/**
  10526. + * \brief Disables direct access to memory allocations in a peer context and
  10527. + * unregisters any registered allocations.
  10528. + *
  10529. + * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
  10530. + * not yet been enabled from \p peerContext to the current context.
  10531. + *
  10532. + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
  10533. + * \p peerContext is not a valid context.
  10534. + *
  10535. + * \param peerContext - Peer context to disable direct access to
  10536. + *
  10537. + * \return
  10538. + * ::CUDA_SUCCESS,
  10539. + * ::CUDA_ERROR_DEINITIALIZED,
  10540. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10541. + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
  10542. + * ::CUDA_ERROR_INVALID_CONTEXT
  10543. + * \notefnerr
  10544. + *
  10545. + * \sa ::cuDeviceCanAccessPeer,
  10546. + * ::cuCtxEnablePeerAccess
  10547. + */
  10548. +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
  10549. +
  10550. +/** @} */ /* END CUDA_PEER_ACCESS */
  10551. +#endif /* __CUDA_API_VERSION >= 4000 */
  10552. +
  10553. +/**
  10554. + * \defgroup CUDA_GRAPHICS Graphics Interoperability
  10555. + *
  10556. + * This section describes the graphics interoperability functions of the
  10557. + * low-level CUDA driver application programming interface.
  10558. + *
  10559. + * @{
  10560. + */
  10561. +
  10562. +/**
  10563. + * \brief Unregisters a graphics resource for access by CUDA
  10564. + *
  10565. + * Unregisters the graphics resource \p resource so it is not accessible by
  10566. + * CUDA unless registered again.
  10567. + *
  10568. + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
  10569. + * returned.
  10570. + *
  10571. + * \param resource - Resource to unregister
  10572. + *
  10573. + * \return
  10574. + * ::CUDA_SUCCESS,
  10575. + * ::CUDA_ERROR_DEINITIALIZED,
  10576. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10577. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10578. + * ::CUDA_ERROR_INVALID_HANDLE,
  10579. + * ::CUDA_ERROR_UNKNOWN
  10580. + * \notefnerr
  10581. + *
  10582. + * \sa
  10583. + * ::cuGraphicsD3D9RegisterResource,
  10584. + * ::cuGraphicsD3D10RegisterResource,
  10585. + * ::cuGraphicsD3D11RegisterResource,
  10586. + * ::cuGraphicsGLRegisterBuffer,
  10587. + * ::cuGraphicsGLRegisterImage
  10588. + */
  10589. +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
  10590. +
  10591. +/**
  10592. + * \brief Get an array through which to access a subresource of a mapped graphics resource.
  10593. + *
  10594. + * Returns in \p *pArray an array through which the subresource of the mapped
  10595. + * graphics resource \p resource which corresponds to array index \p arrayIndex
  10596. + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may
  10597. + * change every time that \p resource is mapped.
  10598. + *
  10599. + * If \p resource is not a texture then it cannot be accessed via an array and
  10600. + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
  10601. + * If \p arrayIndex is not a valid array index for \p resource then
  10602. + * ::CUDA_ERROR_INVALID_VALUE is returned.
  10603. + * If \p mipLevel is not a valid mipmap level for \p resource then
  10604. + * ::CUDA_ERROR_INVALID_VALUE is returned.
  10605. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
  10606. + *
  10607. + * \param pArray - Returned array through which a subresource of \p resource may be accessed
  10608. + * \param resource - Mapped resource to access
  10609. + * \param arrayIndex - Array index for array textures or cubemap face
  10610. + * index as defined by ::CUarray_cubemap_face for
  10611. + * cubemap textures for the subresource to access
  10612. + * \param mipLevel - Mipmap level for the subresource to access
  10613. + *
  10614. + * \return
  10615. + * ::CUDA_SUCCESS,
  10616. + * ::CUDA_ERROR_DEINITIALIZED,
  10617. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10618. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10619. + * ::CUDA_ERROR_INVALID_VALUE,
  10620. + * ::CUDA_ERROR_INVALID_HANDLE,
  10621. + * ::CUDA_ERROR_NOT_MAPPED,
  10622. + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
  10623. + * \notefnerr
  10624. + *
  10625. + * \sa ::cuGraphicsResourceGetMappedPointer
  10626. + */
  10627. +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
  10628. +
  10629. +#if __CUDA_API_VERSION >= 3020
  10630. +/**
  10631. + * \brief Get a device pointer through which to access a mapped graphics resource.
  10632. + *
  10633. + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
  10634. + * \p resource may be accessed.
  10635. + * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer.
  10636. + * The value set in \p *pDevPtr may change every time that \p resource is mapped.
  10637. + *
  10638. + * If \p resource is not a buffer then it cannot be accessed via a pointer and
  10639. + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
  10640. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
  10641. + *
  10642. + * \param pDevPtr - Returned pointer through which \p resource may be accessed
  10643. + * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr
  10644. + * \param resource - Mapped resource to access
  10645. + *
  10646. + * \return
  10647. + * ::CUDA_SUCCESS,
  10648. + * ::CUDA_ERROR_DEINITIALIZED,
  10649. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10650. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10651. + * ::CUDA_ERROR_INVALID_VALUE,
  10652. + * ::CUDA_ERROR_INVALID_HANDLE,
  10653. + * ::CUDA_ERROR_NOT_MAPPED,
  10654. + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
  10655. + * \notefnerr
  10656. + *
  10657. + * \sa
  10658. + * ::cuGraphicsMapResources,
  10659. + * ::cuGraphicsSubResourceGetMappedArray
  10660. + */
  10661. +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
  10662. +#endif /* __CUDA_API_VERSION >= 3020 */
  10663. +
  10664. +/**
  10665. + * \brief Set usage flags for mapping a graphics resource
  10666. + *
  10667. + * Set \p flags for mapping the graphics resource \p resource.
  10668. + *
  10669. + * Changes to \p flags will take effect the next time \p resource is mapped.
  10670. + * The \p flags argument may be any of the following:
  10671. + *
  10672. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
  10673. + * resource will be used. It is therefore assumed that this resource will be
  10674. + * read from and written to by CUDA kernels. This is the default value.
  10675. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
  10676. + * access this resource will not write to this resource.
  10677. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
  10678. + * which access this resource will not read from this resource and will
  10679. + * write over the entire contents of the resource, so none of the data
  10680. + * previously stored in the resource will be preserved.
  10681. + *
  10682. + * If \p resource is presently mapped for access by CUDA then
  10683. + * ::CUDA_ERROR_ALREADY_MAPPED is returned.
  10684. + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
  10685. + *
  10686. + * \param resource - Registered resource to set flags for
  10687. + * \param flags - Parameters for resource mapping
  10688. + *
  10689. + * \return
  10690. + * ::CUDA_SUCCESS,
  10691. + * ::CUDA_ERROR_DEINITIALIZED,
  10692. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10693. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10694. + * ::CUDA_ERROR_INVALID_VALUE,
  10695. + * ::CUDA_ERROR_INVALID_HANDLE,
  10696. + * ::CUDA_ERROR_ALREADY_MAPPED
  10697. + * \notefnerr
  10698. + *
  10699. + * \sa
  10700. + * ::cuGraphicsMapResources
  10701. + */
  10702. +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
  10703. +
  10704. +/**
  10705. + * \brief Map graphics resources for access by CUDA
  10706. + *
  10707. + * Maps the \p count graphics resources in \p resources for access by CUDA.
  10708. + *
  10709. + * The resources in \p resources may be accessed by CUDA until they
  10710. + * are unmapped. The graphics API from which \p resources were registered
  10711. + * should not access any resources while they are mapped by CUDA. If an
  10712. + * application does so, the results are undefined.
  10713. + *
  10714. + * This function provides the synchronization guarantee that any graphics calls
  10715. + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
  10716. + * work issued in \p hStream begins.
  10717. + *
  10718. + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
  10719. + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
  10720. + *
  10721. + * \param count - Number of resources to map
  10722. + * \param resources - Resources to map for CUDA usage
  10723. + * \param hStream - Stream with which to synchronize
  10724. + *
  10725. + * \return
  10726. + * ::CUDA_SUCCESS,
  10727. + * ::CUDA_ERROR_DEINITIALIZED,
  10728. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10729. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10730. + * ::CUDA_ERROR_INVALID_HANDLE,
  10731. + * ::CUDA_ERROR_ALREADY_MAPPED,
  10732. + * ::CUDA_ERROR_UNKNOWN
  10733. + * \notefnerr
  10734. + *
  10735. + * \sa
  10736. + * ::cuGraphicsResourceGetMappedPointer,
  10737. + * ::cuGraphicsSubResourceGetMappedArray,
  10738. + * ::cuGraphicsUnmapResources
  10739. + */
  10740. +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
  10741. +
  10742. +/**
  10743. + * \brief Unmap graphics resources.
  10744. + *
  10745. + * Unmaps the \p count graphics resources in \p resources.
  10746. + *
  10747. + * Once unmapped, the resources in \p resources may not be accessed by CUDA
  10748. + * until they are mapped again.
  10749. + *
  10750. + * This function provides the synchronization guarantee that any CUDA work issued
  10751. + * in \p hStream before ::cuGraphicsUnmapResources() will complete before any
  10752. + * subsequently issued graphics work begins.
  10753. + *
  10754. + *
  10755. + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
  10756. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
  10757. + *
  10758. + * \param count - Number of resources to unmap
  10759. + * \param resources - Resources to unmap
  10760. + * \param hStream - Stream with which to synchronize
  10761. + *
  10762. + * \return
  10763. + * ::CUDA_SUCCESS,
  10764. + * ::CUDA_ERROR_DEINITIALIZED,
  10765. + * ::CUDA_ERROR_NOT_INITIALIZED,
  10766. + * ::CUDA_ERROR_INVALID_CONTEXT,
  10767. + * ::CUDA_ERROR_INVALID_HANDLE,
  10768. + * ::CUDA_ERROR_NOT_MAPPED,
  10769. + * ::CUDA_ERROR_UNKNOWN
  10770. + * \notefnerr
  10771. + *
  10772. + * \sa
  10773. + * ::cuGraphicsMapResources
  10774. + */
  10775. +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
  10776. +
  10777. +/** @} */ /* END CUDA_GRAPHICS */
  10778. +
  10779. +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
  10780. +
  10781. +
  10782. +/** @} */ /* END CUDA_DRIVER */
  10783. +
  10784. +/**
  10785. + * CUDA API versioning support
  10786. + */
  10787. +#if defined(__CUDA_API_VERSION_INTERNAL)
  10788. + #undef cuDeviceTotalMem
  10789. + #undef cuCtxCreate
  10790. + #undef cuModuleGetGlobal
  10791. + #undef cuMemGetInfo
  10792. + #undef cuMemAlloc
  10793. + #undef cuMemAllocPitch
  10794. + #undef cuMemFree
  10795. + #undef cuMemGetAddressRange
  10796. + #undef cuMemAllocHost
  10797. + #undef cuMemHostGetDevicePointer
  10798. + #undef cuMemcpyHtoD
  10799. + #undef cuMemcpyDtoH
  10800. + #undef cuMemcpyDtoD
  10801. + #undef cuMemcpyDtoA
  10802. + #undef cuMemcpyAtoD
  10803. + #undef cuMemcpyHtoA
  10804. + #undef cuMemcpyAtoH
  10805. + #undef cuMemcpyAtoA
  10806. + #undef cuMemcpyHtoAAsync
  10807. + #undef cuMemcpyAtoHAsync
  10808. + #undef cuMemcpy2D
  10809. + #undef cuMemcpy2DUnaligned
  10810. + #undef cuMemcpy3D
  10811. + #undef cuMemcpyHtoDAsync
  10812. + #undef cuMemcpyDtoHAsync
  10813. + #undef cuMemcpyDtoDAsync
  10814. + #undef cuMemcpy2DAsync
  10815. + #undef cuMemcpy3DAsync
  10816. + #undef cuMemsetD8
  10817. + #undef cuMemsetD16
  10818. + #undef cuMemsetD32
  10819. + #undef cuMemsetD2D8
  10820. + #undef cuMemsetD2D16
  10821. + #undef cuMemsetD2D32
  10822. + #undef cuArrayCreate
  10823. + #undef cuArrayGetDescriptor
  10824. + #undef cuArray3DCreate
  10825. + #undef cuArray3DGetDescriptor
  10826. + #undef cuTexRefSetAddress
  10827. + #undef cuTexRefSetAddress2D
  10828. + #undef cuTexRefGetAddress
  10829. + #undef cuGraphicsResourceGetMappedPointer
  10830. + #undef cuCtxDestroy
  10831. + #undef cuCtxPopCurrent
  10832. + #undef cuCtxPushCurrent
  10833. + #undef cuStreamDestroy
  10834. + #undef cuEventDestroy
  10835. +#endif /* __CUDA_API_VERSION_INTERNAL */
  10836. +
  10837. +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010)
  10838. +CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
  10839. +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */
  10840. +
  10841. +/**
  10842. + * CUDA API made obsolete at API version 3020
  10843. + */
  10844. +#if defined(__CUDA_API_VERSION_INTERNAL)
  10845. + #define CUdeviceptr CUdeviceptr_v1
  10846. + #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
  10847. + #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
  10848. + #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
  10849. + #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
  10850. + #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
  10851. + #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
  10852. + #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
  10853. + #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
  10854. +#endif /* __CUDA_API_VERSION_INTERNAL */
  10855. +
  10856. +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
  10857. +
  10858. +typedef unsigned int CUdeviceptr;
  10859. +
  10860. +typedef struct CUDA_MEMCPY2D_st
  10861. +{
  10862. + unsigned int srcXInBytes; /**< Source X in bytes */
  10863. + unsigned int srcY; /**< Source Y */
  10864. + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
  10865. + const void *srcHost; /**< Source host pointer */
  10866. + CUdeviceptr srcDevice; /**< Source device pointer */
  10867. + CUarray srcArray; /**< Source array reference */
  10868. + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
  10869. +
  10870. + unsigned int dstXInBytes; /**< Destination X in bytes */
  10871. + unsigned int dstY; /**< Destination Y */
  10872. + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
  10873. + void *dstHost; /**< Destination host pointer */
  10874. + CUdeviceptr dstDevice; /**< Destination device pointer */
  10875. + CUarray dstArray; /**< Destination array reference */
  10876. + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
  10877. +
  10878. + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
  10879. + unsigned int Height; /**< Height of 2D memory copy */
  10880. +} CUDA_MEMCPY2D;
  10881. +
  10882. +typedef struct CUDA_MEMCPY3D_st
  10883. +{
  10884. + unsigned int srcXInBytes; /**< Source X in bytes */
  10885. + unsigned int srcY; /**< Source Y */
  10886. + unsigned int srcZ; /**< Source Z */
  10887. + unsigned int srcLOD; /**< Source LOD */
  10888. + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
  10889. + const void *srcHost; /**< Source host pointer */
  10890. + CUdeviceptr srcDevice; /**< Source device pointer */
  10891. + CUarray srcArray; /**< Source array reference */
  10892. + void *reserved0; /**< Must be NULL */
  10893. + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
  10894. + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
  10895. +
  10896. + unsigned int dstXInBytes; /**< Destination X in bytes */
  10897. + unsigned int dstY; /**< Destination Y */
  10898. + unsigned int dstZ; /**< Destination Z */
  10899. + unsigned int dstLOD; /**< Destination LOD */
  10900. + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
  10901. + void *dstHost; /**< Destination host pointer */
  10902. + CUdeviceptr dstDevice; /**< Destination device pointer */
  10903. + CUarray dstArray; /**< Destination array reference */
  10904. + void *reserved1; /**< Must be NULL */
  10905. + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
  10906. + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
  10907. +
  10908. + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
  10909. + unsigned int Height; /**< Height of 3D memory copy */
  10910. + unsigned int Depth; /**< Depth of 3D memory copy */
  10911. +} CUDA_MEMCPY3D;
  10912. +
  10913. +typedef struct CUDA_ARRAY_DESCRIPTOR_st
  10914. +{
  10915. + unsigned int Width; /**< Width of array */
  10916. + unsigned int Height; /**< Height of array */
  10917. +
  10918. + CUarray_format Format; /**< Array format */
  10919. + unsigned int NumChannels; /**< Channels per array element */
  10920. +} CUDA_ARRAY_DESCRIPTOR;
  10921. +
  10922. +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
  10923. +{
  10924. + unsigned int Width; /**< Width of 3D array */
  10925. + unsigned int Height; /**< Height of 3D array */
  10926. + unsigned int Depth; /**< Depth of 3D array */
  10927. +
  10928. + CUarray_format Format; /**< Array format */
  10929. + unsigned int NumChannels; /**< Channels per array element */
  10930. + unsigned int Flags; /**< Flags */
  10931. +} CUDA_ARRAY3D_DESCRIPTOR;
  10932. +
  10933. +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
  10934. +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
  10935. +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
  10936. +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
  10937. +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
  10938. +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
  10939. +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
  10940. +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
  10941. +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
  10942. +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
  10943. +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
  10944. +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
  10945. +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
  10946. +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
  10947. +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
  10948. +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
  10949. +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
  10950. +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
  10951. +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
  10952. +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
  10953. +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
  10954. +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
  10955. +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
  10956. +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
  10957. +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
  10958. +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
  10959. +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
  10960. +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
  10961. +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
  10962. +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
  10963. +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
  10964. +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
  10965. +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
  10966. +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
  10967. +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
  10968. +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
  10969. +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
  10970. +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
  10971. +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
  10972. +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
  10973. +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
  10974. +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
  10975. +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */
  10976. +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000
  10977. +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
  10978. +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
  10979. +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
  10980. +CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
  10981. +CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
  10982. +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */
  10983. +
  10984. +#if defined(__CUDA_API_VERSION_INTERNAL)
  10985. + #undef CUdeviceptr
  10986. + #undef CUDA_MEMCPY2D_st
  10987. + #undef CUDA_MEMCPY2D
  10988. + #undef CUDA_MEMCPY3D_st
  10989. + #undef CUDA_MEMCPY3D
  10990. + #undef CUDA_ARRAY_DESCRIPTOR_st
  10991. + #undef CUDA_ARRAY_DESCRIPTOR
  10992. + #undef CUDA_ARRAY3D_DESCRIPTOR_st
  10993. + #undef CUDA_ARRAY3D_DESCRIPTOR
  10994. +#endif /* __CUDA_API_VERSION_INTERNAL */
  10995. +
  10996. +#ifdef __cplusplus
  10997. +}
  10998. +#endif
  10999. +
  11000. +#undef __CUDA_API_VERSION
  11001. +
  11002. +#endif /* __cuda_cuda_h__ */
  11003. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h
  11004. new file mode 100644
  11005. index 0000000..24434ee
  11006. --- /dev/null
  11007. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h
  11008. @@ -0,0 +1,73 @@
  11009. +/*
  11010. + * Copyright (C) 2011 Hendrik Leppkes
  11011. + * http://www.1f0.de
  11012. + *
  11013. + * This Program is free software; you can redistribute it and/or modify
  11014. + * it under the terms of the GNU General Public License as published by
  11015. + * the Free Software Foundation; either version 2, or (at your option)
  11016. + * any later version.
  11017. + *
  11018. + * This Program is distributed in the hope that it will be useful,
  11019. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11020. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11021. + * GNU General Public License for more details.
  11022. + *
  11023. + * You should have received a copy of the GNU General Public License
  11024. + * along with this program; see the file COPYING. If not, write to
  11025. + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  11026. + * http://www.gnu.org/copyleft/gpl.html
  11027. + *
  11028. + * Assembled from parts of the NVIDIA CUDA SDK, Copyright by NVIDIA, All rights reserved.
  11029. + */
  11030. +
  11031. +#pragma once
  11032. +#include "cuda.h"
  11033. +////////////////////////////////////////////////////
  11034. +/// CUDA functions
  11035. +////////////////////////////////////////////////////
  11036. +typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
  11037. +typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev );
  11038. +typedef CUresult CUDAAPI tcuCtxDestroy( CUcontext ctx );
  11039. +typedef CUresult CUDAAPI tcuCtxPushCurrent( CUcontext ctx );
  11040. +typedef CUresult CUDAAPI tcuCtxPopCurrent( CUcontext *pctx );
  11041. +typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
  11042. +typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
  11043. +typedef CUresult CUDAAPI tcuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount );
  11044. +typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
  11045. +typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
  11046. +typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
  11047. +typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
  11048. +typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
  11049. +typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
  11050. +typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
  11051. +typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
  11052. +typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
  11053. +
  11054. +////////////////////////////////////////////////////
  11055. +/// D3D Interop
  11056. +////////////////////////////////////////////////////
  11057. +typedef CUresult CUDAAPI tcuD3D9CtxCreate( CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice );
  11058. +
  11059. +////////////////////////////////////////////////////
  11060. +/// CUVID functions
  11061. +////////////////////////////////////////////////////
  11062. +typedef CUresult CUDAAPI tcuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
  11063. +typedef CUresult CUDAAPI tcuvidCtxLockDestroy(CUvideoctxlock lck);
  11064. +typedef CUresult CUDAAPI tcuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
  11065. +typedef CUresult CUDAAPI tcuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
  11066. +
  11067. +typedef CUresult CUDAAPI tcuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
  11068. +typedef CUresult CUDAAPI tcuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
  11069. +typedef CUresult CUDAAPI tcuvidDestroyVideoParser(CUvideoparser obj);
  11070. +
  11071. +// Create/Destroy the decoder object
  11072. +typedef CUresult CUDAAPI tcuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
  11073. +typedef CUresult CUDAAPI tcuvidDestroyDecoder(CUvideodecoder hDecoder);
  11074. +
  11075. +// Decode a single picture (field or frame)
  11076. +typedef CUresult CUDAAPI tcuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
  11077. +
  11078. +// Post-process and map a video frame for use in cuda. NOTE(review): pDevPtr/pPitch are 32-bit unsigned int here; cuviddec.h defines __CUVID_DEVPTR64 on x86_64 - confirm which entry point this typedef is bound to on 64-bit builds
  11079. +typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
  11080. +// Unmap a previously mapped video frame (DevPtr is likewise a 32-bit unsigned int in this signature)
  11081. +typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
  11082. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h
  11083. new file mode 100644
  11084. index 0000000..4c2674e
  11085. --- /dev/null
  11086. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h
  11087. @@ -0,0 +1,523 @@
  11088. +/*
  11089. + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
  11090. + *
  11091. + * NOTICE TO USER:
  11092. + *
  11093. + * This source code is subject to NVIDIA ownership rights under U.S. and
  11094. + * international Copyright laws. Users and possessors of this source code
  11095. + * are hereby granted a nonexclusive, royalty-free license to use this code
  11096. + * in individual and commercial software.
  11097. + *
  11098. + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
  11099. + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
  11100. + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
  11101. + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
  11102. + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  11103. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
  11104. + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  11105. + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  11106. + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  11107. + * OR PERFORMANCE OF THIS SOURCE CODE.
  11108. + *
  11109. + * U.S. Government End Users. This source code is a "commercial item" as
  11110. + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
  11111. + * "commercial computer software" and "commercial computer software
  11112. + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
  11113. + * and is provided to the U.S. Government only as a commercial end item.
  11114. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
  11115. + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
  11116. + * source code with only those rights set forth herein.
  11117. + *
  11118. + * Any use of this source code in individual and commercial software must
  11119. + * include, in the user documentation and internal comments to the code,
  11120. + * the above Disclaimer and U.S. Government End Users Notice.
  11121. + */
  11122. +
  11123. +#if !defined(__CUDA_VIDEO_H__)
  11124. +#define __CUDA_VIDEO_H__
  11125. +
  11126. +#ifndef __cuda_cuda_h__
  11127. +#include <cuda.h>
  11128. +#endif // __cuda_cuda_h__
  11129. +
  11130. +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
  11131. +#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
  11132. +#define __CUVID_DEVPTR64
  11133. +#endif
  11134. +#endif
  11135. +
  11136. +#if defined(__cplusplus)
  11137. +extern "C" {
  11138. +#endif /* __cplusplus */
  11139. +
  11140. +typedef void *CUvideodecoder;
  11141. +typedef struct _CUcontextlock_st *CUvideoctxlock;
  11142. +
  11143. +typedef enum cudaVideoCodec_enum {
  11144. + cudaVideoCodec_MPEG1=0,
  11145. + cudaVideoCodec_MPEG2,
  11146. + cudaVideoCodec_MPEG4,
  11147. + cudaVideoCodec_VC1,
  11148. + cudaVideoCodec_H264,
  11149. + cudaVideoCodec_JPEG,
  11150. + cudaVideoCodec_H264_SVC,
  11151. + cudaVideoCodec_H264_MVC,
  11152. + cudaVideoCodec_NumCodecs,
  11153. + // Uncompressed YUV
  11154. + cudaVideoCodec_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')), // Y,U,V (4:2:0)
  11155. + cudaVideoCodec_YV12 = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')), // Y,V,U (4:2:0)
  11156. + cudaVideoCodec_NV12 = (('N'<<24)|('V'<<16)|('1'<<8)|('2')), // Y,UV (4:2:0)
  11157. + cudaVideoCodec_YUYV = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')), // YUYV/YUY2 (4:2:2)
  11158. + cudaVideoCodec_UYVY = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')), // UYVY (4:2:2)
  11159. +} cudaVideoCodec;
  11160. +
  11161. +typedef enum cudaVideoSurfaceFormat_enum {
  11162. + cudaVideoSurfaceFormat_NV12=0, // NV12 (currently the only supported output format)
  11163. +} cudaVideoSurfaceFormat;
  11164. +
  11165. +typedef enum cudaVideoDeinterlaceMode_enum {
  11166. + cudaVideoDeinterlaceMode_Weave=0, // Weave both fields (no deinterlacing)
  11167. + cudaVideoDeinterlaceMode_Bob, // Drop one field
  11168. + cudaVideoDeinterlaceMode_Adaptive, // Adaptive deinterlacing
  11169. +} cudaVideoDeinterlaceMode;
  11170. +
  11171. +typedef enum cudaVideoChromaFormat_enum {
  11172. + cudaVideoChromaFormat_Monochrome=0,
  11173. + cudaVideoChromaFormat_420,
  11174. + cudaVideoChromaFormat_422,
  11175. + cudaVideoChromaFormat_444,
  11176. +} cudaVideoChromaFormat;
  11177. +
  11178. +typedef enum cudaVideoCreateFlags_enum {
  11179. + cudaVideoCreate_Default = 0x00, // Default operation mode: use dedicated video engines
  11180. + cudaVideoCreate_PreferCUDA = 0x01, // Use a CUDA-based decoder if faster than dedicated engines (requires a valid vidLock object for multi-threading)
  11181. + cudaVideoCreate_PreferDXVA = 0x02, // Go through DXVA internally if possible (requires D3D9 interop)
  11182. + cudaVideoCreate_PreferCUVID = 0x04, // Use dedicated video engines directly
  11183. +} cudaVideoCreateFlags;
  11184. +
  11185. +
  11186. +typedef struct _CUVIDDECODECREATEINFO
  11187. +{
  11188. + // Decoding
  11189. + unsigned long ulWidth; // Coded Sequence Width
  11190. + unsigned long ulHeight; // Coded Sequence Height
  11191. + unsigned long ulNumDecodeSurfaces; // Maximum number of internal decode surfaces
  11192. + cudaVideoCodec CodecType; // cudaVideoCodec_XXX
  11193. + cudaVideoChromaFormat ChromaFormat; // cudaVideoChromaFormat_XXX (only 4:2:0 is currently supported)
  11194. + unsigned long ulCreationFlags; // Decoder creation flags (cudaVideoCreate_XXX values of cudaVideoCreateFlags)
  11195. + unsigned long Reserved1[5]; // Reserved for future use - set to zero
  11196. + struct { // area of the frame that should be displayed
  11197. + short left;
  11198. + short top;
  11199. + short right;
  11200. + short bottom;
  11201. + } display_area;
  11202. + // Output format
  11203. + cudaVideoSurfaceFormat OutputFormat; // cudaVideoSurfaceFormat_XXX
  11204. + cudaVideoDeinterlaceMode DeinterlaceMode; // cudaVideoDeinterlaceMode_XXX
  11205. + unsigned long ulTargetWidth; // Post-processed Output Width
  11206. + unsigned long ulTargetHeight; // Post-processed Output Height
  11207. + unsigned long ulNumOutputSurfaces; // Maximum number of output surfaces simultaneously mapped
  11208. + CUvideoctxlock vidLock; // If non-NULL, context lock used for synchronizing ownership of the cuda context
  11209. + struct { // target rectangle in the output frame (for aspect ratio conversion)
  11210. + short left;
  11211. + short top;
  11212. + short right;
  11213. + short bottom;
  11214. + } target_rect; // if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
  11215. + unsigned long Reserved2[5]; // Reserved for future use - set to zero
  11216. +} CUVIDDECODECREATEINFO;
  11217. +
  11218. +
  11219. +////////////////////////////////////////////////////////////////////////////////////////////////
  11220. +//
  11221. +// H.264 Picture Parameters
  11222. +//
  11223. +
  11224. +typedef struct _CUVIDH264DPBENTRY
  11225. +{
  11226. + int PicIdx; // picture index of reference frame
  11227. + int FrameIdx; // frame_num(short-term) or LongTermFrameIdx(long-term)
  11228. + int is_long_term; // 0=short term reference, 1=long term reference
  11229. + int not_existing; // non-existing reference frame (corresponding PicIdx should be set to -1)
  11230. + int used_for_reference; // 0=unused, 1=top_field, 2=bottom_field, 3=both_fields
  11231. + int FieldOrderCnt[2]; // field order count of top and bottom fields
  11232. +} CUVIDH264DPBENTRY;
  11233. +
  11234. +typedef struct _CUVIDH264MVCEXT
  11235. +{
  11236. + int num_views_minus1;
  11237. + int view_id;
  11238. + unsigned char inter_view_flag;
  11239. + unsigned char num_inter_view_refs_l0;
  11240. + unsigned char num_inter_view_refs_l1;
  11241. + unsigned char MVCReserved8Bits;
  11242. + int InterViewRefsL0[16];
  11243. + int InterViewRefsL1[16];
  11244. +} CUVIDH264MVCEXT;
  11245. +
  11246. +typedef struct _CUVIDH264SVCEXT
  11247. +{
  11248. + unsigned char profile_idc;
  11249. + unsigned char level_idc;
  11250. + unsigned char DQId;
  11251. + unsigned char DQIdMax;
  11252. + unsigned char disable_inter_layer_deblocking_filter_idc;
  11253. + unsigned char ref_layer_chroma_phase_y_plus1;
  11254. + signed char inter_layer_slice_alpha_c0_offset_div2;
  11255. + signed char inter_layer_slice_beta_offset_div2;
  11256. +
  11257. + unsigned short DPBEntryValidFlag;
  11258. + unsigned char inter_layer_deblocking_filter_control_present_flag;
  11259. + unsigned char extended_spatial_scalability_idc;
  11260. + unsigned char adaptive_tcoeff_level_prediction_flag;
  11261. + unsigned char slice_header_restriction_flag;
  11262. + unsigned char chroma_phase_x_plus1_flag;
  11263. + unsigned char chroma_phase_y_plus1;
  11264. +
  11265. + unsigned char tcoeff_level_prediction_flag;
  11266. + unsigned char constrained_intra_resampling_flag;
  11267. + unsigned char ref_layer_chroma_phase_x_plus1_flag;
  11268. + unsigned char store_ref_base_pic_flag;
  11269. + unsigned char Reserved8BitsA;
  11270. + unsigned char Reserved8BitsB;
  11271. + // For the 4 scaled_ref_layer_XX fields below,
  11272. + // if (extended_spatial_scalability_idc == 1), SPS field, G.7.3.2.1.4, add prefix "seq_"
  11273. + // if (extended_spatial_scalability_idc == 2), SLH field, G.7.3.3.4,
  11274. + short scaled_ref_layer_left_offset;
  11275. + short scaled_ref_layer_top_offset;
  11276. + short scaled_ref_layer_right_offset;
  11277. + short scaled_ref_layer_bottom_offset;
  11278. + unsigned short Reserved16Bits;
  11279. + struct _CUVIDPICPARAMS *pNextLayer; // Points to the picparams for the next layer to be decoded. Linked list ends at the target layer.
  11280. + int bRefBaseLayer; // whether to store ref base pic
  11281. +} CUVIDH264SVCEXT;
  11282. +
  11283. +typedef struct _CUVIDH264PICPARAMS
  11284. +{
  11285. + // SPS
  11286. + int log2_max_frame_num_minus4;
  11287. + int pic_order_cnt_type;
  11288. + int log2_max_pic_order_cnt_lsb_minus4;
  11289. + int delta_pic_order_always_zero_flag;
  11290. + int frame_mbs_only_flag;
  11291. + int direct_8x8_inference_flag;
  11292. + int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
  11293. + unsigned char residual_colour_transform_flag;
  11294. + unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported)
  11295. + unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
  11296. + unsigned char qpprime_y_zero_transform_bypass_flag;
  11297. + // PPS
  11298. + int entropy_coding_mode_flag;
  11299. + int pic_order_present_flag;
  11300. + int num_ref_idx_l0_active_minus1;
  11301. + int num_ref_idx_l1_active_minus1;
  11302. + int weighted_pred_flag;
  11303. + int weighted_bipred_idc;
  11304. + int pic_init_qp_minus26;
  11305. + int deblocking_filter_control_present_flag;
  11306. + int redundant_pic_cnt_present_flag;
  11307. + int transform_8x8_mode_flag;
  11308. + int MbaffFrameFlag;
  11309. + int constrained_intra_pred_flag;
  11310. + int chroma_qp_index_offset;
  11311. + int second_chroma_qp_index_offset;
  11312. + int ref_pic_flag;
  11313. + int frame_num;
  11314. + int CurrFieldOrderCnt[2];
  11315. + // DPB
  11316. + CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
  11317. + // Quantization Matrices (raster-order)
  11318. + unsigned char WeightScale4x4[6][16];
  11319. + unsigned char WeightScale8x8[2][64];
  11320. + // FMO/ASO
  11321. + unsigned char fmo_aso_enable;
  11322. + unsigned char num_slice_groups_minus1;
  11323. + unsigned char slice_group_map_type;
  11324. + signed char pic_init_qs_minus26;
  11325. + unsigned int slice_group_change_rate_minus1;
  11326. + union
  11327. + {
  11328. + unsigned long long slice_group_map_addr;
  11329. + const unsigned char *pMb2SliceGroupMap;
  11330. + } fmo;
  11331. + unsigned int Reserved[12];
  11332. + // SVC/MVC
  11333. + union
  11334. + {
  11335. + CUVIDH264MVCEXT mvcext;
  11336. + CUVIDH264SVCEXT svcext;
  11337. + };
  11338. +} CUVIDH264PICPARAMS;
  11339. +
  11340. +
  11341. +////////////////////////////////////////////////////////////////////////////////////////////////
  11342. +//
  11343. +// MPEG-2 Picture Parameters
  11344. +//
  11345. +
  11346. +typedef struct _CUVIDMPEG2PICPARAMS
  11347. +{
  11348. + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
  11349. + int BackwardRefIdx; // Picture index of backward reference (B-frames)
  11350. + int picture_coding_type;
  11351. + int full_pel_forward_vector;
  11352. + int full_pel_backward_vector;
  11353. + int f_code[2][2];
  11354. + int intra_dc_precision;
  11355. + int frame_pred_frame_dct;
  11356. + int concealment_motion_vectors;
  11357. + int q_scale_type;
  11358. + int intra_vlc_format;
  11359. + int alternate_scan;
  11360. + int top_field_first;
  11361. + // Quantization matrices (raster order)
  11362. + unsigned char QuantMatrixIntra[64];
  11363. + unsigned char QuantMatrixInter[64];
  11364. +} CUVIDMPEG2PICPARAMS;
  11365. +
  11366. +////////////////////////////////////////////////////////////////////////////////////////////////
  11367. +//
  11368. +// MPEG-4 Picture Parameters
  11369. +//
  11370. +
  11371. +// MPEG-4 has VOP types instead of Picture types
  11372. +#define I_VOP 0
  11373. +#define P_VOP 1
  11374. +#define B_VOP 2
  11375. +#define S_VOP 3
  11376. +
  11377. +typedef struct _CUVIDMPEG4PICPARAMS
  11378. +{
  11379. + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
  11380. + int BackwardRefIdx; // Picture index of backward reference (B-frames)
  11381. + // VOL
  11382. + int video_object_layer_width;
  11383. + int video_object_layer_height;
  11384. + int vop_time_increment_bitcount;
  11385. + int top_field_first;
  11386. + int resync_marker_disable;
  11387. + int quant_type;
  11388. + int quarter_sample;
  11389. + int short_video_header;
  11390. + int divx_flags;
  11391. + // VOP
  11392. + int vop_coding_type;
  11393. + int vop_coded;
  11394. + int vop_rounding_type;
  11395. + int alternate_vertical_scan_flag;
  11396. + int interlaced;
  11397. + int vop_fcode_forward;
  11398. + int vop_fcode_backward;
  11399. + int trd[2];
  11400. + int trb[2];
  11401. + // Quantization matrices (raster order)
  11402. + unsigned char QuantMatrixIntra[64];
  11403. + unsigned char QuantMatrixInter[64];
  11404. + int gmc_enabled;
  11405. +} CUVIDMPEG4PICPARAMS;
  11406. +
  11407. +////////////////////////////////////////////////////////////////////////////////////////////////
  11408. +//
  11409. +// VC1 Picture Parameters
  11410. +//
  11411. +
  11412. +typedef struct _CUVIDVC1PICPARAMS
  11413. +{
  11414. + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
  11415. + int BackwardRefIdx; // Picture index of backward reference (B-frames)
  11416. + int FrameWidth; // Actual frame width
  11417. + int FrameHeight; // Actual frame height
  11418. + // PICTURE
  11419. + int intra_pic_flag; // Set to 1 for I,BI frames
  11420. + int ref_pic_flag; // Set to 1 for I,P frames
  11421. + int progressive_fcm; // Progressive frame
  11422. + // SEQUENCE
  11423. + int profile;
  11424. + int postprocflag;
  11425. + int pulldown;
  11426. + int interlace;
  11427. + int tfcntrflag;
  11428. + int finterpflag;
  11429. + int psf;
  11430. + int multires;
  11431. + int syncmarker;
  11432. + int rangered;
  11433. + int maxbframes;
  11434. + // ENTRYPOINT
  11435. + int panscan_flag;
  11436. + int refdist_flag;
  11437. + int extended_mv;
  11438. + int dquant;
  11439. + int vstransform;
  11440. + int loopfilter;
  11441. + int fastuvmc;
  11442. + int overlap;
  11443. + int quantizer;
  11444. + int extended_dmv;
  11445. + int range_mapy_flag;
  11446. + int range_mapy;
  11447. + int range_mapuv_flag;
  11448. + int range_mapuv;
  11449. + int rangeredfrm; // range reduction state
  11450. +} CUVIDVC1PICPARAMS;
  11451. +
  11452. +////////////////////////////////////////////////////////////////////////////////////////////////
  11453. +//
  11454. +// JPEG Picture Parameters
  11455. +//
  11456. +
  11457. +typedef struct _CUVIDJPEGPICPARAMS
  11458. +{
  11459. + int Reserved;
  11460. +} CUVIDJPEGPICPARAMS;
  11461. +
  11462. +////////////////////////////////////////////////////////////////////////////////////////////////
  11463. +//
  11464. +// Picture Parameters for Decoding
  11465. +//
  11466. +
  11467. +typedef struct _CUVIDPICPARAMS
  11468. +{
  11469. + int PicWidthInMbs; // Coded Frame Width (presumably in macroblocks, per the field name)
  11470. + int FrameHeightInMbs; // Coded Frame Height
  11471. + int CurrPicIdx; // Output index of the current picture
  11472. + int field_pic_flag; // 0=frame picture, 1=field picture
  11473. + int bottom_field_flag; // 0=top field, 1=bottom field (ignored if field_pic_flag=0)
  11474. + int second_field; // Second field of a complementary field pair
  11475. + // Bitstream data
  11476. + unsigned int nBitstreamDataLen; // Number of bytes in bitstream data buffer
  11477. + const unsigned char *pBitstreamData; // Ptr to bitstream data for this picture (slice-layer)
  11478. + unsigned int nNumSlices; // Number of slices in this picture
  11479. + const unsigned int *pSliceDataOffsets; // nNumSlices entries, contains offset of each slice within the bitstream data buffer
  11480. + int ref_pic_flag; // This picture is a reference picture
  11481. + int intra_pic_flag; // This picture is entirely intra coded
  11482. + unsigned int Reserved[30]; // Reserved for future use
  11483. + // Codec-specific data
  11484. + union {
  11485. + CUVIDMPEG2PICPARAMS mpeg2; // Also used for MPEG-1
  11486. + CUVIDH264PICPARAMS h264;
  11487. + CUVIDVC1PICPARAMS vc1;
  11488. + CUVIDMPEG4PICPARAMS mpeg4;
  11489. + CUVIDJPEGPICPARAMS jpeg;
  11490. + unsigned int CodecReserved[1024];
  11491. + } CodecSpecific;
  11492. +} CUVIDPICPARAMS;
  11493. +
  11494. +
  11495. +////////////////////////////////////////////////////////////////////////////////////////////////
  11496. +//
  11497. +// Post-processing
  11498. +//
  11499. +
  11500. +typedef struct _CUVIDPROCPARAMS
  11501. +{
  11502. + int progressive_frame; // Input is progressive (deinterlace_mode will be ignored)
  11503. + int second_field; // Output the second field (ignored if deinterlace mode is Weave)
  11504. + int top_field_first; // Input frame is top field first (1st field is top, 2nd field is bottom)
  11505. + int unpaired_field; // Input only contains one field (2nd field is invalid)
  11506. + // The fields below are used for raw YUV input
  11507. + unsigned int reserved_flags; // Reserved for future use (set to zero)
  11508. + unsigned int reserved_zero; // Reserved (set to zero)
  11509. + unsigned long long raw_input_dptr; // Input CUdeviceptr for raw YUV extensions
  11510. + unsigned int raw_input_pitch; // pitch in bytes of raw YUV input (should be aligned appropriately)
  11511. + unsigned int raw_input_format; // Reserved for future use (set to zero)
  11512. + unsigned long long raw_output_dptr; // Reserved for future use (set to zero)
  11513. + unsigned int raw_output_pitch; // Reserved for future use (set to zero)
  11514. + unsigned int Reserved[48];
  11515. + void *Reserved3[3];
  11516. +} CUVIDPROCPARAMS;
  11517. +
  11518. +////////////////////////////////////////////////////////////////////////////////////////////////
  11519. +//
  11520. +// In order to minimize decode latencies, there should always be at least 2 pictures in the decode
  11521. +// queue at any time, in order to make sure that all decode engines are always busy.
  11522. +//
  11523. +// Overall data flow:
  11524. +// - cuvidCreateDecoder(...)
  11525. +// For each picture:
  11526. +// - cuvidDecodePicture(N)
  11527. +// - cuvidMapVideoFrame(N-4)
  11528. +// - do some processing in cuda
  11529. +// - cuvidUnmapVideoFrame(N-4)
  11530. +// - cuvidDecodePicture(N+1)
  11531. +// - cuvidMapVideoFrame(N-3)
  11532. +// ...
  11533. +// - cuvidDestroyDecoder(...)
  11534. +//
  11535. +// NOTE:
  11536. +// - In the current version, the cuda context MUST be created from a D3D device, using cuD3D9CtxCreate function.
  11537. +// For multi-threaded operation, the D3D device must also be created with the D3DCREATE_MULTITHREADED flag.
  11538. +// - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
  11539. +// - cuvidDecodePicture may block the calling thread if there are too many pictures pending
  11540. +// in the decode queue
  11541. +//
  11542. +////////////////////////////////////////////////////////////////////////////////////////////////
  11543. +
  11544. +// Create/Destroy the decoder object
  11545. +extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
  11546. +extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder);
  11547. +
  11548. +// Decode a single picture (field or frame)
  11549. +extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
  11550. +
  11551. +#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
  11552. +// Post-process and map a video frame for use in cuda
  11553. +extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx,
  11554. + unsigned int *pDevPtr, unsigned int *pPitch,
  11555. + CUVIDPROCPARAMS *pVPP);
  11556. +// Unmap a previously mapped video frame
  11557. +extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
  11558. +#endif
  11559. +
  11560. +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
  11561. +extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
  11562. + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
  11563. +extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
  11564. +#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
  11565. +#define cuvidMapVideoFrame cuvidMapVideoFrame64
  11566. +#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64
  11567. +#endif
  11568. +#endif
  11569. +
  11570. +// Get the pointer to the d3d9 surface that is the decode RT
  11571. +extern CUresult CUDAAPI cuvidGetVideoFrameSurface(CUvideodecoder hDecoder, int nPicIdx, void **pSrcSurface);
  11572. +
  11573. +////////////////////////////////////////////////////////////////////////////////////////////////
  11574. +//
  11575. +// Context-locking: to facilitate multi-threaded implementations, the following 4 functions
  11576. +// provide a simple mutex-style host synchronization. If a non-NULL context is specified
  11577. +// in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
  11578. +// context before making any cuda calls.
  11579. +// A multi-threaded application could create a lock associated with a context handle so that
  11580. +// multiple threads can safely share the same cuda context:
  11581. +// - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
  11582. +// that can be passed to cuvidCtxLockCreate.
  11583. +// - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
  11584. +//
  11585. +// NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
  11586. +// decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
  11587. +
  11588. +extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
  11589. +extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck);
  11590. +extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
  11591. +extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
  11592. +
  11593. +////////////////////////////////////////////////////////////////////////////////////////////////
  11594. +
  11595. +#if defined(__cplusplus)
  11596. +}
  11597. +
  11598. +// Auto-lock helper for C++ applications
  11599. +class CCtxAutoLock // Scoped helper: calls cuvidCtxLock in the constructor and cuvidCtxUnlock in the destructor
  11600. +{
  11601. +private:
  11602. + CUvideoctxlock m_ctx; // Lock handle handed to cuvidCtxLock/cuvidCtxUnlock
  11603. +public:
  11604. + CCtxAutoLock(CUvideoctxlock ctx):m_ctx(ctx) { cuvidCtxLock(m_ctx,0); } // CUresult of cuvidCtxLock is discarded
  11605. + ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx,0); } // CUresult of cuvidCtxUnlock is discarded
  11606. +}; // NOTE(review): copying is not disabled; a copied instance would call cuvidCtxUnlock a second time
  11607. +
  11608. +#endif /* __cplusplus */
  11609. +
  11610. +#endif // __CUDA_VIDEO_H__
  11611. diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h
  11612. new file mode 100644
  11613. index 0000000..0b81ee4
  11614. --- /dev/null
  11615. +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h
  11616. @@ -0,0 +1,228 @@
  11617. +/*
  11618. + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
  11619. + *
  11620. + * NOTICE TO USER:
  11621. + *
  11622. + * This source code is subject to NVIDIA ownership rights under U.S. and
  11623. + * international Copyright laws. Users and possessors of this source code
  11624. + * are hereby granted a nonexclusive, royalty-free license to use this code
  11625. + * in individual and commercial software.
  11626. + *
  11627. + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
  11628. + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
  11629. + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
  11630. + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
  11631. + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  11632. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
  11633. + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  11634. + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  11635. + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  11636. + * OR PERFORMANCE OF THIS SOURCE CODE.
  11637. + *
  11638. + * U.S. Government End Users. This source code is a "commercial item" as
  11639. + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
  11640. + * "commercial computer software" and "commercial computer software
  11641. + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
  11642. + * and is provided to the U.S. Government only as a commercial end item.
  11643. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
  11644. + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
  11645. + * source code with only those rights set forth herein.
  11646. + *
  11647. + * Any use of this source code in individual and commercial software must
  11648. + * include, in the user documentation and internal comments to the code,
  11649. + * the above Disclaimer and U.S. Government End Users Notice.
  11650. + */
  11651. +
  11652. +#if !defined(__NVCUVID_H__)
  11653. +#define __NVCUVID_H__
  11654. +
  11655. +#include "cuviddec.h"
  11656. +
  11657. +#if defined(__cplusplus)
  11658. +extern "C" {
  11659. +#endif /* __cplusplus */
  11660. +
  11661. +////////////////////////////////////////////////////////////////////////////////////////////////
  11662. +//
  11663. +// High-level helper APIs for video sources
  11664. +//
  11665. +
  11666. +typedef void *CUvideosource;
  11667. +typedef void *CUvideoparser;
  11668. +typedef long long CUvideotimestamp;
  11669. +
  11670. +////////////////////////////////////////////////////////////////////////////////////////////////
  11671. +//
  11672. +// video data structures
  11673. +//
  11674. +
  11675. +// Video Source State
  11676. +typedef enum {
  11677. + cudaVideoState_Error = -1, // Error state (invalid source)
  11678. + cudaVideoState_Stopped = 0, // Source is stopped (or reached end-of-stream)
  11679. + cudaVideoState_Started = 1, // Source is running and delivering data
  11680. +} cudaVideoState;
  11681. +
  11682. +// Audio compression
  11683. +typedef enum {
  11684. + cudaAudioCodec_MPEG1=0, // MPEG-1 Audio
  11685. + cudaAudioCodec_MPEG2, // MPEG-2 Audio
  11686. + cudaAudioCodec_MP3, // MPEG-1 Layer III Audio
  11687. + cudaAudioCodec_AC3, // Dolby Digital (AC3) Audio
  11688. + cudaAudioCodec_LPCM, // PCM Audio
  11689. +} cudaAudioCodec;
  11690. +
  11691. +
  11692. +// Video format
  11693. +typedef struct
  11694. +{
  11695. + cudaVideoCodec codec; // Compression format
  11696. + struct {
  11697. + unsigned int numerator; // frame rate numerator (0 = unspecified or variable frame rate)
  11698. + unsigned int denominator; // frame rate denominator (0 = unspecified or variable frame rate)
  11699. + } frame_rate; // frame rate = numerator / denominator (for example: 30000/1001)
  11700. + int progressive_sequence; // 0=interlaced, 1=progressive
  11701. + unsigned int coded_width; // coded frame width
  11702. + unsigned int coded_height; // coded frame height
  11703. + struct { // area of the frame that should be displayed
  11704. + int left; // typical example:
  11705. + int top; // coded_width = 1920, coded_height = 1088
  11706. + int right; // display_area = { 0,0,1920,1080 }
  11707. + int bottom;
  11708. + } display_area;
  11709. + cudaVideoChromaFormat chroma_format; // Chroma format
  11710. + unsigned int bitrate; // video bitrate (bps, 0=unknown)
  11711. + struct { // Display Aspect Ratio = x:y (4:3, 16:9, etc)
  11712. + int x;
  11713. + int y;
  11714. + } display_aspect_ratio;
  11715. + struct {
  11716. + unsigned char video_format;
  11717. + unsigned char color_primaries;
  11718. + unsigned char transfer_characteristics;
  11719. + unsigned char matrix_coefficients;
  11720. + } video_signal_description;
  11721. + unsigned int seqhdr_data_length; // Additional bytes following (CUVIDEOFORMATEX)
  11722. +} CUVIDEOFORMAT;
  11723. +
  11724. +// Video format including raw sequence header information
  11725. +typedef struct
  11726. +{
  11727. + CUVIDEOFORMAT format;
  11728. + unsigned char raw_seqhdr_data[1024];
  11729. +} CUVIDEOFORMATEX;
  11730. +
  11731. +
  11732. +// Audio Format
  11733. +typedef struct
  11734. +{
  11735. + cudaAudioCodec codec; // Compression format
  11736. + unsigned int channels; // number of audio channels
  11737. + unsigned int samplespersec; // sampling frequency
  11738. + unsigned int bitrate; // For uncompressed, can also be used to determine bits per sample
  11739. + unsigned int reserved1;
  11740. + unsigned int reserved2;
  11741. +} CUAUDIOFORMAT;
  11742. +
  11743. +
  11744. +
  11745. +////////////////////////////////////////////////////////////////////////////////////////////////
  11746. +//
  11747. +// video source
  11748. +//
  11749. +
  11750. +// Data packet
  11751. +typedef enum {
  11752. + CUVID_PKT_ENDOFSTREAM = 0x01, // Set when this is the last packet for this stream
  11753. + CUVID_PKT_TIMESTAMP = 0x02, // Timestamp is valid
  11754. + CUVID_PKT_DISCONTINUITY = 0x04, // Set when a discontinuity has to be signalled
  11755. +} CUvideopacketflags;
  11756. +
  11757. +typedef struct _CUVIDSOURCEDATAPACKET // NOTE(review): unsigned long is 32-bit on LLP64 and 64-bit on LP64 targets, so field widths differ per platform
  11758. +{
  11759. + unsigned long flags; // Combination of CUvideopacketflags values (CUVID_PKT_XXX)
  11760. + unsigned long payload_size; // number of bytes in the payload (may be zero if CUVID_PKT_ENDOFSTREAM is set)
  11761. + const unsigned char *payload; // Pointer to packet payload data (may be NULL if CUVID_PKT_ENDOFSTREAM is set)
  11762. + CUvideotimestamp timestamp; // Presentation timestamp (10MHz clock; presumably follows ulClockRate when that is non-zero - confirm), only valid if CUVID_PKT_TIMESTAMP flag is set
  11763. +} CUVIDSOURCEDATAPACKET;
  11764. +
  11765. +// Callback for packet delivery
  11766. +typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
  11767. +
  11768. +typedef struct _CUVIDSOURCEPARAMS
  11769. +{
  11770. + unsigned int ulClockRate; // Timestamp units in Hz (0=default=10000000Hz)
  11771. + unsigned int uReserved1[7]; // Reserved for future use - set to zero
  11772. + void *pUserData; // Parameter passed in to the data handlers
  11773. + PFNVIDSOURCECALLBACK pfnVideoDataHandler; // Called to deliver video packets
  11774. + PFNVIDSOURCECALLBACK pfnAudioDataHandler; // Called to deliver audio packets
  11775. + void *pvReserved2[8]; // Reserved for future use - set to NULL
  11776. +} CUVIDSOURCEPARAMS;
  11777. +
  11778. +typedef enum {
  11779. + CUVID_FMT_EXTFORMATINFO = 0x100, // Return extended format structure (CUVIDEOFORMATEX)
  11780. +} CUvideosourceformat_flags;
  11781. +
  11782. +#if !defined(__APPLE__)
  11783. +// Video file source
  11784. +CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
  11785. +CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
  11786. +CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
  11787. +CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
  11788. +cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
  11789. +CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
  11790. +CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
  11791. +#endif
  11792. +
  11793. +////////////////////////////////////////////////////////////////////////////////////////////////
  11794. +//
  11795. +// Video parser
  11796. +//
  11797. +
  11798. +typedef struct _CUVIDPARSERDISPINFO
  11799. +{
  11800. + int picture_index;
  11801. + int progressive_frame;
  11802. + int top_field_first;
  11803. + int repeat_first_field; // Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling, -1=unpaired field)
  11804. + CUvideotimestamp timestamp;
  11805. +} CUVIDPARSERDISPINFO;
  11806. +
  11807. +//
  11808. +// Parser callbacks
  11809. +// The parser will call these synchronously from within cuvidParseVideoData(), whenever a picture is ready to
  11810. +// be decoded and/or displayed.
  11811. +//
  11812. +typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
  11813. +typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
  11814. +typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
  11815. +
  11816. +typedef struct _CUVIDPARSERPARAMS
  11817. +{
  11818. + cudaVideoCodec CodecType; // cudaVideoCodec_XXX
  11819. + unsigned int ulMaxNumDecodeSurfaces; // Max # of decode surfaces (parser will cycle through these)
  11820. + unsigned int ulClockRate; // Timestamp units in Hz (0=default=10000000Hz)
  11821. + unsigned int ulErrorThreshold; // % Error threshold (0-100) for calling pfnDecodePicture (100=always call pfnDecodePicture even if picture bitstream is fully corrupted)
  11822. + unsigned int ulMaxDisplayDelay; // Max display queue delay (improves pipelining of decode with display) - 0=no delay (recommended values: 2..4)
  11823. + unsigned int uReserved1[5]; // Reserved for future use - set to 0
  11824. + void *pUserData; // User data for callbacks
  11825. + PFNVIDSEQUENCECALLBACK pfnSequenceCallback; // Called before decoding frames and/or whenever there is a format change
  11826. + PFNVIDDECODECALLBACK pfnDecodePicture; // Called when a picture is ready to be decoded (decode order)
  11827. + PFNVIDDISPLAYCALLBACK pfnDisplayPicture; // Called whenever a picture is ready to be displayed (display order)
  11828. + void *pvReserved2[7]; // Reserved for future use - set to NULL
  11829. + CUVIDEOFORMATEX *pExtVideoInfo; // [Optional] sequence header data from system layer
  11830. +} CUVIDPARSERPARAMS;
  11831. +
  11832. +
  11833. +CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
  11834. +CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
  11835. +CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
  11836. +
  11837. +
  11838. +////////////////////////////////////////////////////////////////////////////////////////////////
  11839. +
  11840. +#if defined(__cplusplus)
  11841. +}
  11842. +#endif /* __cplusplus */
  11843. +
  11844. +#endif // __NVCUVID_H__
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement