  1. From 20f59c593fbaf10192246ccc1f396d3a8af23c20 Mon Sep 17 00:00:00 2001
  2. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  3. Date: Tue, 21 Dec 2021 23:29:58 +0100
  4. Subject: [PATCH 01/12] radv: Add a 32bit memory type.
  5.  
  6. We have to put the command buffers & upload buffers there. With DGC
  7. those can be allocated by the application.
  8.  
  9. Exclude it from all other buffers/images to avoid using up the
  10. precious 32bit address space.
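
[Editor's note, not part of the patch: a minimal sketch of how the split is consumed. A Vulkan application intersects the reported memoryTypeBits with the property flags it needs, so reporting only memory_types_default keeps ordinary buffers/images out of the new RADEON_FLAG_32BIT type. find_memory_type below is a hypothetical helper, not RADV code.]

    #include <stdint.h>
    #include <vulkan/vulkan.h>

    /* Standard Vulkan memory-type selection. Because this patch reports only
     * memory_types_default for regular buffers/images, this loop never lands
     * on the 32-bit type for them. */
    static int32_t
    find_memory_type(const VkPhysicalDeviceMemoryProperties *props,
                     uint32_t type_bits, VkMemoryPropertyFlags required)
    {
       for (uint32_t i = 0; i < props->memoryTypeCount; i++) {
          if ((type_bits & (1u << i)) &&
              (props->memoryTypes[i].propertyFlags & required) == required)
             return (int32_t)i;
       }
       return -1; /* no suitable type */
    }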
  11. ---
  12. src/amd/vulkan/radv_android.c | 2 +-
  13. src/amd/vulkan/radv_device.c | 24 ++++++++++++++++++------
  14. src/amd/vulkan/radv_private.h | 2 ++
  15. 3 files changed, 21 insertions(+), 7 deletions(-)
  16.  
  17. diff --git a/src/amd/vulkan/radv_android.c b/src/amd/vulkan/radv_android.c
  18. index 03bc702f1e5..5dda8fa4e70 100644
  19. --- a/src/amd/vulkan/radv_android.c
  20. +++ b/src/amd/vulkan/radv_android.c
  21. @@ -146,7 +146,7 @@ radv_image_from_gralloc(VkDevice device_h, const VkImageCreateInfo *base_info,
  22. for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
  23. bool is_local = !!(device->physical_device->memory_properties.memoryTypes[i].propertyFlags &
  24. VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
  25. - if (is_local) {
  26. + if (is_local && (device->physical_device->memory_types_default & (1u << i))) {
  27. memory_type_index = i;
  28. break;
  29. }
  30. diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
  31. index e3df1f7d27b..86a8fa9eb35 100644
  32. --- a/src/amd/vulkan/radv_device.c
  33. +++ b/src/amd/vulkan/radv_device.c
  34. @@ -226,6 +226,13 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
  35. .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
  36. .heapIndex = vram_index >= 0 ? vram_index : visible_vram_index,
  37. };
  38. +
  39. + device->memory_domains[type_count] = RADEON_DOMAIN_VRAM;
  40. + device->memory_flags[type_count] = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_32BIT;
  41. + device->memory_properties.memoryTypes[type_count++] = (VkMemoryType){
  42. + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
  43. + .heapIndex = vram_index >= 0 ? vram_index : visible_vram_index,
  44. + };
  45. }
  46.  
  47. if (gart_index >= 0) {
  48. @@ -263,9 +270,9 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
  49. for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) {
  50. VkMemoryType mem_type = device->memory_properties.memoryTypes[i];
  51.  
  52. - if ((mem_type.propertyFlags &
  53. + if (((mem_type.propertyFlags &
  54. (VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) ||
  55. - mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
  56. + mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && !(device->memory_flags[i] & RADEON_FLAG_32BIT)) {
  57.  
  58. VkMemoryPropertyFlags property_flags = mem_type.propertyFlags |
  59. VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD |
  60. @@ -281,6 +288,13 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
  61. }
  62. device->memory_properties.memoryTypeCount = type_count;
  63. }
  64. +
  65. + for (unsigned i = 0; i < type_count; ++i) {
  66. + if (device->memory_flags[i] & RADEON_FLAG_32BIT)
  67. + device->memory_types_32bit |= 1u << i;
  68. + else
  69. + device->memory_types_default |= 1u << i;
  70. + }
  71. }
  72.  
  73. static const char *
  74. @@ -5252,8 +5266,7 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz
  75. VkBufferCreateFlags flags, VkBufferCreateFlags usage,
  76. VkMemoryRequirements2 *pMemoryRequirements)
  77. {
  78. - pMemoryRequirements->memoryRequirements.memoryTypeBits =
  79. - (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
  80. + pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
  81.  
  82. if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
  83. pMemoryRequirements->memoryRequirements.alignment = 4096;
  84. @@ -5315,8 +5328,7 @@ radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequiremen
  85. RADV_FROM_HANDLE(radv_device, device, _device);
  86. RADV_FROM_HANDLE(radv_image, image, pInfo->image);
  87.  
  88. - pMemoryRequirements->memoryRequirements.memoryTypeBits =
  89. - (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
  90. + pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
  91.  
  92. pMemoryRequirements->memoryRequirements.size = image->size;
  93. pMemoryRequirements->memoryRequirements.alignment = image->alignment;
  94. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  95. index 42ba7ba0eaa..d2aa39dfe30 100644
  96. --- a/src/amd/vulkan/radv_private.h
  97. +++ b/src/amd/vulkan/radv_private.h
  98. @@ -315,6 +315,8 @@ struct radv_physical_device {
  99. enum radeon_bo_domain memory_domains[VK_MAX_MEMORY_TYPES];
  100. enum radeon_bo_flag memory_flags[VK_MAX_MEMORY_TYPES];
  101. unsigned heaps;
  102. + uint32_t memory_types_default;
  103. + uint32_t memory_types_32bit;
  104.  
  105. #ifndef _WIN32
  106. int available_nodes;
  107. --
  108. 2.36.1
  109.  
  110. From b797679fd8399a566bb625d59254a4d40a9cc795 Mon Sep 17 00:00:00 2001
  111. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  112. Date: Sun, 30 Jan 2022 01:54:12 +0100
  113. Subject: [PATCH 02/12] radv: Skip setting empty index buffers to avoid a hang
  114.  
  115. ---
  116. src/amd/vulkan/radv_cmd_buffer.c | 12 +++++++-----
  117. 1 file changed, 7 insertions(+), 5 deletions(-)
  118.  
  119. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  120. index 064da750784..a9f2970172f 100644
  121. --- a/src/amd/vulkan/radv_cmd_buffer.c
  122. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  123. @@ -2672,12 +2672,14 @@ radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
  124. if (!indirect)
  125. return;
  126.  
  127. - radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
  128. - radeon_emit(cs, state->index_va);
  129. - radeon_emit(cs, state->index_va >> 32);
  130. + if (state->max_index_count) {
  131. + radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
  132. + radeon_emit(cs, state->index_va);
  133. + radeon_emit(cs, state->index_va >> 32);
  134.  
  135. - radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
  136. - radeon_emit(cs, state->max_index_count);
  137. + radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
  138. + radeon_emit(cs, state->max_index_count);
  139. + }
  140.  
  141. cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
  142. }
  143. --
  144. 2.36.1
  145.  
  146. From 7aad4d28badd3a9ad8050876d4e0f4f0f60258f0 Mon Sep 17 00:00:00 2001
  147. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  148. Date: Sat, 1 Jan 2022 23:38:38 +0100
  149. Subject: [PATCH 03/12] radv: Expose function to write vertex descriptors for
  150. dgc.
  151.  
  152. ---
  153. src/amd/vulkan/radv_cmd_buffer.c | 251 ++++++++++++++++---------------
  154. src/amd/vulkan/radv_private.h | 2 +
  155. 2 files changed, 132 insertions(+), 121 deletions(-)
  156.  
  157. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  158. index a9f2970172f..30abd5c86be 100644
  159. --- a/src/amd/vulkan/radv_cmd_buffer.c
  160. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  161. @@ -3405,153 +3405,162 @@ static const uint32_t data_format_dst_sel[] = {
  162. [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
  163. };
  164.  
  165. -static void
  166. -radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
  167. +void
  168. +write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
  169. + const struct radv_graphics_pipeline *pipeline, void *vb_ptr)
  170. {
  171. - if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
  172. - cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
  173. - /* Mesh shaders don't have vertex descriptors. */
  174. - assert(!cmd_buffer->state.mesh_shading);
  175. + struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
  176. + enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
  177. + unsigned desc_index = 0;
  178. + uint32_t mask = pipeline->vb_desc_usage_mask;
  179. + uint64_t va;
  180. + const struct radv_vs_input_state *vs_state =
  181. + vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
  182. + assert(!vs_state || pipeline->use_per_attribute_vb_descs);
  183.  
  184. - struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
  185. - struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
  186. - enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
  187. - unsigned vb_offset;
  188. - void *vb_ptr;
  189. - unsigned desc_index = 0;
  190. - uint32_t mask = pipeline->vb_desc_usage_mask;
  191. - uint64_t va;
  192. - const struct radv_vs_input_state *vs_state =
  193. - vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
  194. + while (mask) {
  195. + unsigned i = u_bit_scan(&mask);
  196. + uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
  197. + uint32_t offset, rsrc_word3;
  198. + unsigned binding =
  199. + vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
  200. + : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
  201. + struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
  202. + unsigned num_records;
  203. + unsigned stride;
  204. +
  205. + if (vs_state) {
  206. + unsigned format = vs_state->formats[i];
  207. + unsigned dfmt = format & 0xf;
  208. + unsigned nfmt = (format >> 4) & 0x7;
  209. +
  210. + rsrc_word3 =
  211. + vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
  212. +
  213. + if (chip >= GFX10)
  214. + rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
  215. + else
  216. + rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
  217. + } else {
  218. + if (chip >= GFX10)
  219. + rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
  220. + else
  221. + rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
  222. + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
  223. + }
  224.  
  225. - /* allocate some descriptor state for vertex buffers */
  226. - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
  227. - return;
  228. + if (!buffer) {
  229. + if (vs_state) {
  230. + /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
  231. + * to include the format/word3 so that the alpha channel is 1 for formats without an
  232. + * alpha channel.
  233. + */
  234. + desc[0] = 0;
  235. + desc[1] = S_008F04_STRIDE(16);
  236. + desc[2] = 0;
  237. + desc[3] = rsrc_word3;
  238. + } else {
  239. + memset(desc, 0, 4 * 4);
  240. + }
  241. + continue;
  242. + }
  243.  
  244. - assert(!vs_state || pipeline->use_per_attribute_vb_descs);
  245. + va = radv_buffer_get_va(buffer->bo);
  246.  
  247. - while (mask) {
  248. - unsigned i = u_bit_scan(&mask);
  249. - uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
  250. - uint32_t offset, rsrc_word3;
  251. - unsigned binding =
  252. - vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
  253. - : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
  254. - struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
  255. - unsigned num_records;
  256. - unsigned stride;
  257. + offset = cmd_buffer->vertex_bindings[binding].offset;
  258. + va += offset + buffer->offset;
  259. + if (vs_state)
  260. + va += vs_state->offsets[i];
  261.  
  262. - if (vs_state) {
  263. - unsigned format = vs_state->formats[i];
  264. - unsigned dfmt = format & 0xf;
  265. - unsigned nfmt = (format >> 4) & 0x7;
  266. + if (cmd_buffer->vertex_bindings[binding].size) {
  267. + num_records = cmd_buffer->vertex_bindings[binding].size;
  268. + } else {
  269. + num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
  270. + }
  271.  
  272. - rsrc_word3 =
  273. - vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
  274. + if (pipeline->uses_dynamic_stride) {
  275. + stride = cmd_buffer->vertex_bindings[binding].stride;
  276. + } else {
  277. + stride = pipeline->binding_stride[binding];
  278. + }
  279.  
  280. - if (chip >= GFX10)
  281. - rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
  282. - else
  283. - rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
  284. + if (pipeline->use_per_attribute_vb_descs) {
  285. + uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
  286. + : pipeline->attrib_ends[i];
  287. +
  288. + if (num_records < attrib_end) {
  289. + num_records = 0; /* not enough space for one vertex */
  290. + } else if (stride == 0) {
  291. + num_records = 1; /* only one vertex */
  292. } else {
  293. - if (chip >= GFX10)
  294. - rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
  295. - else
  296. - rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
  297. - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
  298. + num_records = (num_records - attrib_end) / stride + 1;
  299. + /* If attrib_offset>stride, then the compiler will increase the vertex index by
  300. + * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
  301. + * only allowed with static strides.
  302. + */
  303. + num_records += pipeline->attrib_index_offset[i];
  304. }
  305.  
  306. - if (!buffer) {
  307. + /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
  308. + * into bytes in that case. GFX8 always uses bytes.
  309. + */
  310. + if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
  311. + num_records = (num_records - 1) * stride + attrib_end;
  312. + } else if (!num_records) {
  313. + /* On GFX9, it seems bounds checking is disabled if both
  314. + * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
  315. + * GFX10.3 but it doesn't hurt.
  316. + */
  317. if (vs_state) {
  318. - /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
  319. - * to include the format/word3 so that the alpha channel is 1 for formats without an
  320. - * alpha channel.
  321. - */
  322. desc[0] = 0;
  323. desc[1] = S_008F04_STRIDE(16);
  324. desc[2] = 0;
  325. desc[3] = rsrc_word3;
  326. } else {
  327. - memset(desc, 0, 4 * 4);
  328. + memset(desc, 0, 16);
  329. }
  330. continue;
  331. }
  332. + } else {
  333. + if (chip != GFX8 && stride)
  334. + num_records = DIV_ROUND_UP(num_records, stride);
  335. + }
  336.  
  337. - va = radv_buffer_get_va(buffer->bo);
  338. -
  339. - offset = cmd_buffer->vertex_bindings[binding].offset;
  340. - va += offset + buffer->offset;
  341. - if (vs_state)
  342. - va += vs_state->offsets[i];
  343. -
  344. - if (cmd_buffer->vertex_bindings[binding].size) {
  345. - num_records = cmd_buffer->vertex_bindings[binding].size;
  346. - } else {
  347. - num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
  348. - }
  349. -
  350. - if (pipeline->uses_dynamic_stride) {
  351. - stride = cmd_buffer->vertex_bindings[binding].stride;
  352. - } else {
  353. - stride = pipeline->binding_stride[binding];
  354. - }
  355. + if (chip >= GFX10) {
  356. + /* OOB_SELECT chooses the out-of-bounds check:
  357. + * - 1: index >= NUM_RECORDS (Structured)
  358. + * - 3: offset >= NUM_RECORDS (Raw)
  359. + */
  360. + int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
  361. + rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
  362. + }
  363.  
  364. - if (pipeline->use_per_attribute_vb_descs) {
  365. - uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
  366. - : pipeline->attrib_ends[i];
  367. + desc[0] = va;
  368. + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
  369. + desc[2] = num_records;
  370. + desc[3] = rsrc_word3;
  371. + }
  372. +}
  373.  
  374. - if (num_records < attrib_end) {
  375. - num_records = 0; /* not enough space for one vertex */
  376. - } else if (stride == 0) {
  377. - num_records = 1; /* only one vertex */
  378. - } else {
  379. - num_records = (num_records - attrib_end) / stride + 1;
  380. - /* If attrib_offset>stride, then the compiler will increase the vertex index by
  381. - * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
  382. - * only allowed with static strides.
  383. - */
  384. - num_records += pipeline->attrib_index_offset[i];
  385. - }
  386. +static void
  387. +radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
  388. +{
  389. + if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
  390. + cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
  391. + /* Mesh shaders don't have vertex descriptors. */
  392. + assert(!cmd_buffer->state.mesh_shading);
  393.  
  394. - /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
  395. - * into bytes in that case. GFX8 always uses bytes.
  396. - */
  397. - if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
  398. - num_records = (num_records - 1) * stride + attrib_end;
  399. - } else if (!num_records) {
  400. - /* On GFX9, it seems bounds checking is disabled if both
  401. - * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
  402. - * GFX10.3 but it doesn't hurt.
  403. - */
  404. - if (vs_state) {
  405. - desc[0] = 0;
  406. - desc[1] = S_008F04_STRIDE(16);
  407. - desc[2] = 0;
  408. - desc[3] = rsrc_word3;
  409. - } else {
  410. - memset(desc, 0, 16);
  411. - }
  412. - continue;
  413. - }
  414. - } else {
  415. - if (chip != GFX8 && stride)
  416. - num_records = DIV_ROUND_UP(num_records, stride);
  417. - }
  418. + struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
  419. + unsigned vb_offset;
  420. + void *vb_ptr;
  421. + uint64_t va;
  422.  
  423. - if (chip >= GFX10) {
  424. - /* OOB_SELECT chooses the out-of-bounds check:
  425. - * - 1: index >= NUM_RECORDS (Structured)
  426. - * - 3: offset >= NUM_RECORDS (Raw)
  427. - */
  428. - int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
  429. - rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
  430. - }
  431. + /* allocate some descriptor state for vertex buffers */
  432. + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
  433. + &vb_ptr))
  434. + return;
  435.  
  436. - desc[0] = va;
  437. - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
  438. - desc[2] = num_records;
  439. - desc[3] = rsrc_word3;
  440. - }
  441. + write_vertex_descriptors(cmd_buffer, pipeline, vb_ptr);
  442.  
  443. va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
  444. va += vb_offset;
  445. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  446. index d2aa39dfe30..8d3b1eb1afb 100644
  447. --- a/src/amd/vulkan/radv_private.h
  448. +++ b/src/amd/vulkan/radv_private.h
  449. @@ -1656,6 +1656,8 @@ void radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
  450. const struct radv_subpass *subpass);
  451. bool radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size,
  452. const void *data, unsigned *out_offset);
  453. +void write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
  454. + const struct radv_graphics_pipeline *pipeline, void *vb_ptr);
  455.  
  456. void radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer);
  457. void radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer);
  458. --
  459. 2.36.1
  460.  
  461. From fd645676eaed6b9528ba7408d5c20aeaed4295ad Mon Sep 17 00:00:00 2001
  462. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  463. Date: Mon, 7 Feb 2022 02:04:56 +0100
  464. Subject: [PATCH 04/12] radv: Always store stride in the vbo descriptor.
  465.  
  466. So we can use it in the DGC shader.
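
[Editor's note, not part of the patch: with the stride always encoded via S_008F04_STRIDE, the DGC shader added later can recover it straight from descriptor word 1; it emits nir_ubfe(word1, 16, 14) for this. In C terms that is roughly the sketch below; vbo_desc_stride is a hypothetical helper, not RADV code.]

    #include <stdint.h>

    /* Extract the STRIDE field (bits [29:16] of buffer-descriptor word 1). */
    static inline uint32_t
    vbo_desc_stride(const uint32_t desc[4])
    {
       return (desc[1] >> 16) & 0x3fff;
    }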
  467. ---
  468. src/amd/vulkan/radv_cmd_buffer.c | 24 ++++++++----------------
  469. 1 file changed, 8 insertions(+), 16 deletions(-)
  470.  
  471. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  472. index 30abd5c86be..5a64749c625 100644
  473. --- a/src/amd/vulkan/radv_cmd_buffer.c
  474. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  475. @@ -3449,19 +3449,21 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
  476. S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
  477. }
  478.  
  479. + if (pipeline->uses_dynamic_stride) {
  480. + stride = cmd_buffer->vertex_bindings[binding].stride;
  481. + } else {
  482. + stride = pipeline->binding_stride[binding];
  483. + }
  484. +
  485. if (!buffer) {
  486. - if (vs_state) {
  487. /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
  488. * to include the format/word3 so that the alpha channel is 1 for formats without an
  489. * alpha channel.
  490. */
  491. desc[0] = 0;
  492. - desc[1] = S_008F04_STRIDE(16);
  493. + desc[1] = S_008F04_STRIDE(stride);
  494. desc[2] = 0;
  495. desc[3] = rsrc_word3;
  496. - } else {
  497. - memset(desc, 0, 4 * 4);
  498. - }
  499. continue;
  500. }
  501.  
  502. @@ -3478,12 +3480,6 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
  503. num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
  504. }
  505.  
  506. - if (pipeline->uses_dynamic_stride) {
  507. - stride = cmd_buffer->vertex_bindings[binding].stride;
  508. - } else {
  509. - stride = pipeline->binding_stride[binding];
  510. - }
  511. -
  512. if (pipeline->use_per_attribute_vb_descs) {
  513. uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
  514. : pipeline->attrib_ends[i];
  515. @@ -3511,14 +3507,10 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
  516. * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
  517. * GFX10.3 but it doesn't hurt.
  518. */
  519. - if (vs_state) {
  520. desc[0] = 0;
  521. - desc[1] = S_008F04_STRIDE(16);
  522. + desc[1] = S_008F04_STRIDE(stride);
  523. desc[2] = 0;
  524. desc[3] = rsrc_word3;
  525. - } else {
  526. - memset(desc, 0, 16);
  527. - }
  528. continue;
  529. }
  530. } else {
  531. --
  532. 2.36.1
  533.  
  534. From 8c5047236608abf1f352a39d81d99212bf13b2eb Mon Sep 17 00:00:00 2001
  535. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  536. Date: Mon, 7 Feb 2022 02:08:51 +0100
  537. Subject: [PATCH 05/12] radv: Allow 32bit memory for indirect buffers.
  538.  
  539. ---
  540. src/amd/vulkan/radv_device.c | 3 +++
  541. 1 file changed, 3 insertions(+)
  542.  
  543. diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
  544. index 86a8fa9eb35..378d82c5765 100644
  545. --- a/src/amd/vulkan/radv_device.c
  546. +++ b/src/amd/vulkan/radv_device.c
  547. @@ -5267,6 +5267,9 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz
  548. VkMemoryRequirements2 *pMemoryRequirements)
  549. {
  550. pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
  551. + if (usage & VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT)
  552. + pMemoryRequirements->memoryRequirements.memoryTypeBits |=
  553. + device->physical_device->memory_types_32bit;
  554.  
  555. if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
  556. pMemoryRequirements->memoryRequirements.alignment = 4096;
  557. --
  558. 2.36.1
  559.  
  560. From 82c2fd5183e6aeb8829905456a0080d58965f7c4 Mon Sep 17 00:00:00 2001
  561. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  562. Date: Tue, 28 Jun 2022 00:37:02 +0200
  563. Subject: [PATCH 06/12] radv: Expose helper for base pa_su_sc_mode_cntl.
  564.  
  565. ---
  566. src/amd/vulkan/radv_cmd_buffer.c | 12 +++++++++---
  567. src/amd/vulkan/radv_private.h | 1 +
  568. 2 files changed, 10 insertions(+), 3 deletions(-)
  569.  
  570. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  571. index 5a64749c625..4569788997c 100644
  572. --- a/src/amd/vulkan/radv_cmd_buffer.c
  573. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  574. @@ -1558,11 +1558,10 @@ radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
  575. S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
  576. }
  577.  
  578. -static void
  579. -radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
  580. +uint32_t radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
  581. {
  582. unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
  583. - struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
  584. + const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
  585.  
  586. pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
  587. C_028814_CULL_BACK &
  588. @@ -1577,6 +1576,13 @@ radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
  589. S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
  590. S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
  591. S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
  592. + return pa_su_sc_mode_cntl;
  593. +}
  594. +
  595. +static void
  596. +radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
  597. +{
  598. + unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
  599.  
  600. radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
  601. }
  602. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  603. index 8d3b1eb1afb..9a6790090dc 100644
  604. --- a/src/amd/vulkan/radv_private.h
  605. +++ b/src/amd/vulkan/radv_private.h
  606. @@ -1643,6 +1643,7 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin
  607. void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
  608.  
  609. void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries);
  610. +uint32_t radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer);
  611.  
  612. unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs);
  613. uint32_t radv_hash_vs_prolog(const void *key_);
  614. --
  615. 2.36.1
  616.  
  617. From 66539189d78eff85ba6a339f92fc6b8c4c976997 Mon Sep 17 00:00:00 2001
  618. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  619. Date: Tue, 28 Jun 2022 00:14:49 +0200
  620. Subject: [PATCH 07/12] radv: Add flushing for DGC.
  621.  
  622. ---
  623. src/amd/vulkan/radv_cmd_buffer.c | 8 +++++---
  624. 1 file changed, 5 insertions(+), 3 deletions(-)
  625.  
  626. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  627. index 4569788997c..eb68eaa53d1 100644
  628. --- a/src/amd/vulkan/radv_cmd_buffer.c
  629. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  630. @@ -4109,9 +4109,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_fla
  631. {
  632. switch ((VkAccessFlags2)(1 << b)) {
  633. case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
  634. - /* SMEM loads are used to read compute dispatch size in shaders */
  635. - if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
  636. - flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
  637. + /* SCACHE potentially for reading the dispatch size from the shader. The
  638. + * rest is for the DGC shader input. */
  639. + flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
  640. + if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
  641. + flush_bits |= RADV_CMD_FLAG_INV_L2;
  642. break;
  643. case VK_ACCESS_2_INDEX_READ_BIT:
  644. case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
  645. --
  646. 2.36.1
  647.  
  648. From e561ef7c88923b0d5671666cdcb85a15f13f90fa Mon Sep 17 00:00:00 2001
  649. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  650. Date: Mon, 7 Feb 2022 03:28:01 +0100
  651. Subject: [PATCH 08/12] radv: Add DGC meta shader.
  652.  
  653. This generates the cmd and upload buffers.
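
[Editor's note, not part of the patch: a rough sketch of the per-sequence layout the prepare shader below assumes, one invocation per sequence. It is derived from the NIR in this patch (cmd_buf_stride, cmd_buf_size, upload_stride push constants); the C names are mine.]

    #include <stdint.h>

    /* Offsets mirror what the NIR computes: packets for a sequence start at
     * sequence_id * cmd_buf_stride, and the tail pass pads the remainder of
     * the command buffer with NOPs up to cmd_buf_size. VBO descriptors and
     * push-constant copies live after the command-buffer region. */
    static void
    dgc_sequence_offsets(uint32_t sequence_id,
                         uint32_t cmd_buf_stride, uint32_t cmd_buf_size,
                         uint32_t upload_stride,
                         uint32_t *cmd_buf_offset, uint32_t *upload_offset)
    {
       *cmd_buf_offset = sequence_id * cmd_buf_stride;
       *upload_offset  = cmd_buf_size + sequence_id * upload_stride;
    }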
  654. ---
  655. src/amd/vulkan/meson.build | 1 +
  656. .../vulkan/radv_device_generated_commands.c | 892 ++++++++++++++++++
  657. src/amd/vulkan/radv_meta.c | 7 +
  658. src/amd/vulkan/radv_meta.h | 3 +
  659. src/amd/vulkan/radv_private.h | 6 +
  660. 5 files changed, 909 insertions(+)
  661. create mode 100644 src/amd/vulkan/radv_device_generated_commands.c
  662.  
  663. diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
  664. index 75f0685a77a..ebe7cb087b4 100644
  665. --- a/src/amd/vulkan/meson.build
  666. +++ b/src/amd/vulkan/meson.build
  667. @@ -49,6 +49,7 @@ libradv_files = files(
  668. 'radv_device.c',
  669. 'radv_descriptor_set.c',
  670. 'radv_descriptor_set.h',
  671. + 'radv_device_generated_commands.c',
  672. 'radv_formats.c',
  673. 'radv_image.c',
  674. 'radv_meta.c',
  675. diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
  676. new file mode 100644
  677. index 00000000000..68d8e4d5060
  678. --- /dev/null
  679. +++ b/src/amd/vulkan/radv_device_generated_commands.c
  680. @@ -0,0 +1,892 @@
  681. +/*
  682. + * Copyright © 2021 Google
  683. + *
  684. + * Permission is hereby granted, free of charge, to any person obtaining a
  685. + * copy of this software and associated documentation files (the "Software"),
  686. + * to deal in the Software without restriction, including without limitation
  687. + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  688. + * and/or sell copies of the Software, and to permit persons to whom the
  689. + * Software is furnished to do so, subject to the following conditions:
  690. + *
  691. + * The above copyright notice and this permission notice (including the next
  692. + * paragraph) shall be included in all copies or substantial portions of the
  693. + * Software.
  694. + *
  695. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  696. + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  697. + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  698. + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  699. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  700. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  701. + * IN THE SOFTWARE.
  702. + */
  703. +
  704. +#include "radv_meta.h"
  705. +#include "radv_private.h"
  706. +
  707. +#include "nir_builder.h"
  708. +
  709. +enum radv_dgc_token_type {
  710. + RADV_DGC_INDEX_BUFFER,
  711. + RADV_DGC_DRAW,
  712. + RADV_DGC_INDEXED_DRAW,
  713. +};
  714. +
  715. +struct radv_dgc_token {
  716. + uint16_t type; /* enum radv_dgc_token_type, but making the size explicit */
  717. + uint16_t offset; /* offset in the input stream */
  718. + union {
  719. + struct {
  720. + uint16_t vtx_base_sgpr;
  721. + } draw;
  722. + struct {
  723. + uint16_t index_size;
  724. + uint16_t vtx_base_sgpr;
  725. + uint32_t max_index_count;
  726. + } indexed_draw;
  727. + };
  728. +};
  729. +
  730. +struct radv_dgc_params {
  731. + uint32_t cmd_buf_stride;
  732. + uint32_t cmd_buf_size;
  733. + uint32_t upload_stride;
  734. + uint32_t upload_addr;
  735. + uint32_t sequence_count;
  736. + uint32_t stream_stride;
  737. +
  738. + /* draw info */
  739. + uint16_t draw_indexed;
  740. + uint16_t draw_params_offset;
  741. + uint16_t base_index_size;
  742. + uint16_t vtx_base_sgpr;
  743. + uint32_t max_index_count;
  744. +
  745. + /* bind index buffer info. Valid if base_index_size == 0 && draw_indexed */
  746. + uint16_t index_buffer_offset;
  747. +
  748. + uint8_t vbo_cnt;
  749. + uint8_t const_copy;
  750. +
  751. + /* Which VBOs are set in this indirect layout. */
  752. + uint32_t vbo_bind_mask;
  753. +
  754. + uint16_t vbo_reg;
  755. + uint16_t const_copy_size;
  756. +
  757. + uint64_t push_constant_mask;
  758. +
  759. + uint32_t ibo_type_32;
  760. + uint32_t ibo_type_8;
  761. +
  762. + uint16_t push_constant_shader_cnt;
  763. +
  764. + uint16_t emit_state;
  765. + uint32_t pa_su_sc_mode_cntl_base;
  766. + uint16_t state_offset;
  767. + uint16_t scissor_count;
  768. + uint16_t scissor_offset; /* in parameter buffer. */
  769. +};
  770. +
  771. +#define load_param32(b, field) \
  772. + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
  773. + .base = offsetof(struct radv_dgc_params, field), .range = 4)
  774. +
  775. +#define load_param16(b, field) \
  776. + nir_ubfe( \
  777. + (b), \
  778. + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
  779. + .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
  780. + nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 2) * 8), nir_imm_int((b), 16))
  781. +
  782. +#define load_param8(b, field) \
  783. + nir_ubfe( \
  784. + (b), \
  785. + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
  786. + .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
  787. + nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 3) * 8), nir_imm_int((b), 8))
  788. +
  789. +#define load_param64(b, field) \
  790. + nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \
  791. + .base = offsetof(struct radv_dgc_params, field), .range = 8))
  792. +
  793. +static nir_ssa_def *
  794. +nir_pkt3(nir_builder *b, unsigned op, nir_ssa_def *len)
  795. +{
  796. + len = nir_iand_imm(b, len, 0x3fff);
  797. + return nir_ior(b, nir_imm_int(b, PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op)),
  798. + nir_ishl(b, len, nir_imm_int(b, 16)));
  799. +}
  800. +
  801. +static nir_ssa_def *
  802. +dgc_emit_userdata_vertex(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *vtx_base_sgpr,
  803. + nir_ssa_def *first_vertex, nir_ssa_def *first_instance, nir_ssa_def *drawid)
  804. +{
  805. + vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
  806. + nir_ssa_def *has_drawid =
  807. + nir_ine(b, nir_iand_imm(b, vtx_base_sgpr, 1u << 14), nir_imm_int(b, 0));
  808. + nir_ssa_def *has_baseinstance =
  809. + nir_ine(b, nir_iand_imm(b, vtx_base_sgpr, 1u << 15), nir_imm_int(b, 0));
  810. +
  811. + nir_ssa_def *pkt_cnt = nir_imm_int(b, 1);
  812. + pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
  813. + pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
  814. +
  815. + nir_ssa_def *values[5] = {
  816. + nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex,
  817. + nir_imm_int(b, PKT3_NOP_PAD), nir_imm_int(b, PKT3_NOP_PAD),
  818. + };
  819. +
  820. + values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance),
  821. + nir_bcsel(b, has_drawid, drawid, first_instance), values[4]);
  822. + values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]);
  823. +
  824. + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
  825. +
  826. + nir_store_ssbo(b, nir_vec(b, values, 4), dst_buf, offset, .write_mask = 0xf,
  827. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  828. + nir_store_ssbo(b, nir_vec(b, values + 4, 1), dst_buf, nir_iadd_imm(b, offset, 16),
  829. + .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
  830. + return nir_iadd_imm(b, offset, 20);
  831. +}
  832. +
  833. +static nir_ssa_def *
  834. +dgc_emit_instance_count(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *instance_count)
  835. +{
  836. + nir_ssa_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count};
  837. +
  838. + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
  839. +
  840. + nir_store_ssbo(b, nir_vec(b, values, 2), dst_buf, offset, .write_mask = 0x3,
  841. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  842. + return nir_iadd_imm(b, offset, 8);
  843. +}
  844. +
  845. +static nir_ssa_def *
  846. +dgc_emit_draw_indexed(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index_offset,
  847. + nir_ssa_def *index_count, nir_ssa_def *max_index_count)
  848. +{
  849. + nir_ssa_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)),
  850. + max_index_count, index_offset, index_count,
  851. + nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)};
  852. +
  853. + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
  854. +
  855. + nir_store_ssbo(b, nir_vec(b, values, 4), dst_buf, offset, .write_mask = 0xf,
  856. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  857. + nir_store_ssbo(b, nir_vec(b, values + 4, 1), dst_buf, nir_iadd_imm(b, offset, 16),
  858. + .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
  859. + return nir_iadd_imm(b, offset, 20);
  860. +}
  861. +
  862. +static nir_ssa_def *
  863. +dgc_emit_draw(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *vertex_count)
  864. +{
  865. + nir_ssa_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count,
  866. + nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)};
  867. +
  868. + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
  869. +
  870. + nir_store_ssbo(b, nir_vec(b, values, 3), dst_buf, offset, .write_mask = 0x7,
  871. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  872. + return nir_iadd_imm(b, offset, 12);
  873. +}
  874. +
  875. +static void
  876. +build_dgc_buffer_tail(nir_builder *b, nir_ssa_def *sequence_count)
  877. +{
  878. + nir_ssa_def *global_id = get_global_ids(b, 1);
  879. +
  880. + nir_ssa_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
  881. + nir_ssa_def *cmd_buf_size = load_param32(b, cmd_buf_size);
  882. +
  883. + nir_push_if(b, nir_ieq(b, global_id, nir_imm_int(b, 0)));
  884. + {
  885. + nir_ssa_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
  886. +
  887. + nir_variable *offset =
  888. + nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
  889. + nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
  890. +
  891. + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
  892. + nir_push_loop(b);
  893. + {
  894. + nir_ssa_def *curr_offset = nir_load_var(b, offset);
  895. +
  896. + nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
  897. + {
  898. + nir_jump(b, nir_jump_break);
  899. + }
  900. + nir_pop_if(b, NULL);
  901. +
  902. + nir_ssa_def *packet_size = nir_isub(b, cmd_buf_size, curr_offset);
  903. + packet_size = nir_umin(b, packet_size, nir_imm_int(b, 0x3ffc * 4));
  904. +
  905. + nir_ssa_def *len = nir_ushr_imm(b, packet_size, 2);
  906. + len = nir_iadd_imm(b, len, -2);
  907. + nir_ssa_def *packet = nir_pkt3(b, PKT3_NOP, len);
  908. +
  909. + nir_store_ssbo(b, packet, dst_buf, curr_offset, .write_mask = 0x1,
  910. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  911. + nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
  912. + }
  913. + nir_pop_loop(b, NULL);
  914. + }
  915. + nir_pop_if(b, NULL);
  916. +}
  917. +
  918. +static nir_shader *
  919. +build_dgc_prepare_shader(struct radv_device *dev)
  920. +{
  921. + nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
  922. + b.shader->info.workgroup_size[0] = 64;
  923. +
  924. + nir_ssa_def *global_id = get_global_ids(&b, 1);
  925. +
  926. + nir_ssa_def *sequence_id = global_id;
  927. +
  928. + nir_ssa_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
  929. + nir_ssa_def *sequence_count = load_param32(&b, sequence_count);
  930. + nir_ssa_def *stream_stride = load_param32(&b, stream_stride);
  931. +
  932. + nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
  933. + nir_store_var(&b, count_var, sequence_count, 0x1);
  934. +
  935. + nir_push_if(&b, nir_ieq_imm(&b, sequence_count, UINT32_MAX));
  936. + {
  937. + nir_ssa_def *count_buf = radv_meta_load_descriptor(&b, 0, 4);
  938. + nir_ssa_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0), .align_mul = 4);
  939. + nir_store_var(&b, count_var, cnt, 0x1);
  940. + }
  941. + nir_pop_if(&b, NULL);
  942. +
  943. + sequence_count = nir_load_var(&b, count_var);
  944. +
  945. + nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
  946. + {
  947. + nir_variable *cmd_buf_offset =
  948. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset");
  949. + nir_store_var(&b, cmd_buf_offset, nir_imul(&b, global_id, cmd_buf_stride), 1);
  950. + nir_ssa_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf_offset), cmd_buf_stride);
  951. +
  952. + nir_ssa_def *stream_buf = radv_meta_load_descriptor(&b, 0, 1);
  953. + nir_ssa_def *stream_base = nir_imul(&b, sequence_id, stream_stride);
  954. +
  955. + nir_variable *upload_offset =
  956. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset");
  957. + nir_store_var(&b, upload_offset,
  958. + nir_iadd(&b, load_param32(&b, cmd_buf_size),
  959. + nir_imul(&b, load_param32(&b, upload_stride), sequence_id)),
  960. + 0x1);
  961. +
  962. + nir_ssa_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask);
  963. + nir_ssa_def *vbo_cnt = load_param8(&b, vbo_cnt);
  964. + nir_push_if(&b, nir_ine(&b, vbo_bind_mask, nir_imm_int(&b, 0)));
  965. + {
  966. + nir_variable *vbo_idx =
  967. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
  968. + nir_store_var(&b, vbo_idx, nir_imm_int(&b, 0), 0x1);
  969. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  970. +
  971. + nir_push_loop(&b);
  972. + {
  973. + nir_push_if(&b, nir_uge(&b, nir_load_var(&b, vbo_idx), vbo_cnt));
  974. + {
  975. + nir_jump(&b, nir_jump_break);
  976. + }
  977. + nir_pop_if(&b, NULL);
  978. +
  979. + nir_ssa_def *vbo_offset = nir_imul(&b, nir_load_var(&b, vbo_idx), nir_imm_int(&b, 16));
  980. + nir_variable *vbo_data =
  981. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
  982. +
  983. + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
  984. + nir_store_var(&b, vbo_data,
  985. + nir_load_ssbo(&b, 4, 32, param_buf, vbo_offset, .align_mul = 4), 0xf);
  986. +
  987. + nir_ssa_def *vbo_override =
  988. + nir_ine(&b,
  989. + nir_iand(&b, vbo_bind_mask,
  990. + nir_ishl(&b, nir_imm_int(&b, 1), nir_load_var(&b, vbo_idx))),
  991. + nir_imm_int(&b, 0));
  992. + nir_push_if(&b, vbo_override);
  993. + {
  994. + nir_ssa_def *vbo_offset_offset =
  995. + nir_iadd(&b, nir_imul(&b, vbo_cnt, nir_imm_int(&b, 16)),
  996. + nir_imul(&b, nir_load_var(&b, vbo_idx), nir_imm_int(&b, 8)));
  997. + nir_ssa_def *vbo_over_data =
  998. + nir_load_ssbo(&b, 2, 32, param_buf, vbo_offset_offset, .align_mul = 4);
  999. + nir_ssa_def *stream_offset = nir_iadd(
  1000. + &b, stream_base, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 0x7FFF));
  1001. + nir_ssa_def *stream_data =
  1002. + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
  1003. +
  1004. + nir_ssa_def *va = nir_pack_64_2x32(&b, nir_channels(&b, stream_data, 0x3));
  1005. + nir_ssa_def *size = nir_channel(&b, stream_data, 2);
  1006. + nir_ssa_def *stride = nir_channel(&b, stream_data, 3);
  1007. +
  1008. + nir_ssa_def *vs_state_offset = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 15));
  1009. + va = nir_iadd(&b, va, nir_u2u64(&b, vs_state_offset));
  1010. +
  1011. + nir_ssa_def *dyn_stride =
  1012. + nir_ine(&b, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 1u << 15),
  1013. + nir_imm_int(&b, 0));
  1014. + nir_ssa_def *old_stride =
  1015. + nir_ubfe(&b, nir_channel(&b, nir_load_var(&b, vbo_data), 1), nir_imm_int(&b, 16),
  1016. + nir_imm_int(&b, 14));
  1017. + stride = nir_bcsel(&b, dyn_stride, stride, old_stride);
  1018. +
  1019. + nir_ssa_def *use_per_attribute_vb_descs =
  1020. + nir_ine(&b, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 1u << 31),
  1021. + nir_imm_int(&b, 0));
  1022. + nir_variable *num_records = nir_variable_create(b.shader, nir_var_shader_temp,
  1023. + glsl_uint_type(), "num_records");
  1024. + nir_store_var(&b, num_records, size, 0x1);
  1025. +
  1026. + nir_push_if(&b, use_per_attribute_vb_descs);
  1027. + {
  1028. + nir_ssa_def *attrib_end = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1),
  1029. + nir_imm_int(&b, 16), nir_imm_int(&b, 16));
  1030. + nir_ssa_def *attrib_index_offset =
  1031. + nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1), nir_imm_int(&b, 0),
  1032. + nir_imm_int(&b, 16));
  1033. +
  1034. + nir_push_if(&b, nir_ult(&b, nir_load_var(&b, num_records), attrib_end));
  1035. + {
  1036. + nir_store_var(&b, num_records, nir_imm_int(&b, 0), 0x1);
  1037. + }
  1038. + nir_push_else(&b, NULL);
  1039. + nir_push_if(&b, nir_ieq_imm(&b, stride, 0));
  1040. + {
  1041. + nir_store_var(&b, num_records, nir_imm_int(&b, 1), 0x1);
  1042. + }
  1043. + nir_push_else(&b, NULL);
  1044. + {
  1045. + nir_ssa_def *r = nir_iadd(
  1046. + &b,
  1047. + nir_iadd_imm(
  1048. + &b,
  1049. + nir_udiv(&b, nir_isub(&b, nir_load_var(&b, num_records), attrib_end),
  1050. + stride),
  1051. + 1),
  1052. + attrib_index_offset);
  1053. + nir_store_var(&b, num_records, r, 0x1);
  1054. + }
  1055. + nir_pop_if(&b, NULL);
  1056. + nir_pop_if(&b, NULL);
  1057. +
  1058. + nir_ssa_def *convert_cond =
  1059. + nir_ine(&b, nir_load_var(&b, num_records), nir_imm_int(&b, 0));
  1060. + if (dev->physical_device->rad_info.gfx_level == GFX9)
  1061. + convert_cond = nir_imm_bool(&b, false);
  1062. + else if (dev->physical_device->rad_info.gfx_level != GFX8)
  1063. + convert_cond =
  1064. + nir_iand(&b, convert_cond, nir_ieq_imm(&b, stride, 0));
  1065. +
  1066. + nir_ssa_def *new_records = nir_iadd(
  1067. + &b, nir_imul(&b, nir_iadd_imm(&b, nir_load_var(&b, num_records), -1), stride),
  1068. + attrib_end);
  1069. + new_records =
  1070. + nir_bcsel(&b, convert_cond, new_records, nir_load_var(&b, num_records));
  1071. + nir_store_var(&b, num_records, new_records, 0x1);
  1072. + }
  1073. + nir_push_else(&b, NULL);
  1074. + {
  1075. + if (dev->physical_device->rad_info.gfx_level != GFX8) {
  1076. + nir_push_if(&b, nir_ine(&b, stride, nir_imm_int(&b, 0)));
  1077. + {
  1078. + nir_ssa_def *r = nir_iadd(&b, nir_load_var(&b, num_records),
  1079. + nir_iadd_imm(&b, stride, -1));
  1080. + nir_store_var(&b, num_records, nir_udiv(&b, r, stride), 0x1);
  1081. + }
  1082. + nir_pop_if(&b, NULL);
  1083. + }
  1084. + }
  1085. + nir_pop_if(&b, NULL);
  1086. +
  1087. + nir_ssa_def *rsrc_word3 = nir_channel(&b, nir_load_var(&b, vbo_data), 3);
  1088. + if (dev->physical_device->rad_info.gfx_level >= GFX10) {
  1089. + nir_ssa_def *oob_select = nir_bcsel(
  1090. + &b, nir_ieq_imm(&b, stride, 0), nir_imm_int(&b, V_008F0C_OOB_SELECT_RAW),
  1091. + nir_imm_int(&b, V_008F0C_OOB_SELECT_STRUCTURED));
  1092. + rsrc_word3 = nir_iand_imm(&b, rsrc_word3, C_008F0C_OOB_SELECT);
  1093. + rsrc_word3 =
  1094. + nir_ior(&b, rsrc_word3, nir_ishl(&b, oob_select, nir_imm_int(&b, 28)));
  1095. + }
  1096. +
  1097. + nir_ssa_def *va_hi = nir_iand_imm(&b, nir_unpack_64_2x32_split_y(&b, va), 0xFFFF);
  1098. + stride = nir_iand_imm(&b, stride, 0x3FFF);
  1099. + nir_ssa_def *new_vbo_data[4] = {
  1100. + nir_unpack_64_2x32_split_x(&b, va),
  1101. + nir_ior(&b, nir_ishl(&b, stride, nir_imm_int(&b, 16)), va_hi),
  1102. + nir_load_var(&b, num_records), rsrc_word3};
  1103. + nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf);
  1104. + }
  1105. + nir_pop_if(&b, NULL);
  1106. +
  1107. + nir_ssa_def *upload_off = nir_iadd(&b, nir_load_var(&b, upload_offset), vbo_offset);
  1108. + nir_store_ssbo(&b, nir_load_var(&b, vbo_data), cmd_buf, upload_off, .write_mask = 0xf,
  1109. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1110. + nir_store_var(&b, vbo_idx, nir_iadd_imm(&b, nir_load_var(&b, vbo_idx), 1), 0x1);
  1111. + }
  1112. + nir_pop_loop(&b, NULL);
  1113. + nir_ssa_def *packet[3] = {
  1114. + nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(&b, vbo_reg),
  1115. + nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))};
  1116. +
  1117. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1118. + nir_store_ssbo(&b, nir_vec(&b, packet, 3), cmd_buf, off, .write_mask = 0x7,
  1119. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1120. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 12), 0x1);
  1121. +
  1122. + nir_store_var(&b, upload_offset, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_imul(&b, vbo_cnt, nir_imm_int(&b, 16))), 0x1);
  1123. + }
  1124. + nir_pop_if(&b, NULL);
  1125. +
  1126. +
  1127. + nir_ssa_def *push_const_mask = load_param64(&b, push_constant_mask);
  1128. + nir_push_if(&b, nir_ine(&b, push_const_mask, nir_imm_int64(&b, 0)));
  1129. + {
  1130. + nir_ssa_def *const_copy = nir_ine(&b, load_param8(&b, const_copy), nir_imm_int(&b, 0));
  1131. + nir_ssa_def *const_copy_size = load_param16(&b, const_copy_size);
  1132. + nir_ssa_def *const_copy_words = nir_ushr_imm(&b, const_copy_size, 2);
  1133. + const_copy_words = nir_bcsel(&b, const_copy, const_copy_words, nir_imm_int(&b, 0));
  1134. +
  1135. + nir_variable *idx =
  1136. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx");
  1137. + nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);
  1138. +
  1139. + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
  1140. + nir_ssa_def *param_offset = nir_imul(&b, vbo_cnt, nir_imm_int(&b, 24));
  1141. + nir_ssa_def *param_offset_offset = nir_iadd_imm(&b, param_offset, MESA_VULKAN_SHADER_STAGES * 12);
  1142. + nir_ssa_def *param_const_offset = nir_iadd_imm(&b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12);
  1143. + nir_push_loop(&b);
  1144. + {
  1145. + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
  1146. + nir_push_if(&b, nir_uge(&b, cur_idx, const_copy_words));
  1147. + {
  1148. + nir_jump(&b, nir_jump_break);
  1149. + }
  1150. + nir_pop_if(&b, NULL);
  1151. +
  1152. + nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
  1153. +
  1154. + nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
  1155. + update = nir_bcsel(&b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64)), update, nir_imm_int64(&b, 0));
  1156. +
  1157. + nir_push_if(&b, nir_ine(&b, update, nir_imm_int64(&b, 0)));
  1158. + {
  1159. + nir_ssa_def *stream_offset = nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4);
  1160. + nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4);
  1161. + nir_store_var(&b, data, new_data, 0x1);
  1162. + }
  1163. + nir_push_else(&b, NULL);
  1164. + {
  1165. + nir_store_var(&b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4), 0x1);
  1166. + }
  1167. + nir_pop_if(&b, NULL);
  1168. +
  1169. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1170. + nir_store_ssbo(&b, nir_load_var(&b, data), cmd_buf, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
  1171. +
  1172. + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
  1173. + }
  1174. + nir_pop_loop(&b, NULL);
  1175. +
  1176. + nir_variable *shader_idx =
  1177. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx");
  1178. + nir_store_var(&b, shader_idx, nir_imm_int(&b, 0), 0x1);
  1179. + nir_ssa_def *shader_cnt = load_param16(&b, push_constant_shader_cnt);
  1180. +
  1181. + nir_push_loop(&b);
  1182. + {
  1183. + nir_ssa_def *cur_shader_idx = nir_load_var(&b, shader_idx);
  1184. + nir_push_if(&b, nir_uge(&b, cur_shader_idx, shader_cnt));
  1185. + {
  1186. + nir_jump(&b, nir_jump_break);
  1187. + }
  1188. + nir_pop_if(&b, NULL);
  1189. +
  1190. + nir_ssa_def *reg_info = nir_load_ssbo(&b, 3, 32, param_buf, nir_iadd(&b, param_offset, nir_imul_imm(&b, cur_shader_idx, 12)), .align_mul = 4);
  1191. + nir_ssa_def *upload_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 0), nir_imm_int(&b, 16));
  1192. + nir_ssa_def *inline_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 16));
  1193. + nir_ssa_def *inline_mask = nir_pack_64_2x32(&b, nir_channels(&b, reg_info, 0x6));
  1194. +
  1195. + nir_push_if(&b, nir_ine(&b, upload_sgpr, nir_imm_int(&b, 0)));
  1196. + {
  1197. + nir_ssa_def *pkt[3] = {
  1198. + nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)),
  1199. + upload_sgpr,
  1200. + nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))
  1201. + };
  1202. +
  1203. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1204. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1205. + nir_store_ssbo(&b, nir_vec(&b, pkt, 3), cmd_buf, off, .write_mask = 0x7, .access = ACCESS_NON_READABLE, .align_mul = 4);
  1206. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 12), 0x1);
  1207. + }
  1208. + nir_pop_if(&b, NULL);
  1209. +
  1210. + nir_push_if(&b, nir_ine(&b, inline_sgpr, nir_imm_int(&b, 0)));
  1211. + {
  1212. + nir_ssa_def *inline_len = nir_bit_count(&b, inline_mask);
  1213. + nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);
  1214. +
  1215. + nir_ssa_def *pkt[2] = {
  1216. + nir_pkt3(&b, PKT3_SET_SH_REG, inline_len),
  1217. + inline_sgpr
  1218. + };
  1219. +
  1220. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1221. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1222. + nir_store_ssbo(&b, nir_vec(&b, pkt, 2), cmd_buf, off, .write_mask = 0x3, .access = ACCESS_NON_READABLE, .align_mul = 4);
  1223. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 8), 0x1);
  1224. +
  1225. + nir_push_loop(&b);
  1226. + {
  1227. + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
  1228. + nir_push_if(&b, nir_uge(&b, cur_idx, nir_imm_int(&b, 64)));
  1229. + {
  1230. + nir_jump(&b, nir_jump_break);
  1231. + }
  1232. + nir_pop_if(&b, NULL);
  1233. +
  1234. + nir_ssa_def *l = nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx);
  1235. + nir_push_if(&b,nir_ieq(&b, nir_iand(&b, l, inline_mask), nir_imm_int64(&b, 0)));
  1236. + {
  1237. + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
  1238. + nir_jump(&b, nir_jump_continue);
  1239. + }
  1240. + nir_pop_if(&b, NULL);
  1241. +
  1242. + nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
  1243. +
  1244. + nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
  1245. + update = nir_bcsel(&b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64)), update, nir_imm_int64(&b, 0));
  1246. +
  1247. + nir_push_if(&b, nir_ine(&b, update, nir_imm_int64(&b, 0)));
  1248. + {
  1249. + nir_ssa_def *stream_offset = nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4);
  1250. + nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4);
  1251. + nir_store_var(&b, data, new_data, 0x1);
  1252. + }
  1253. + nir_push_else(&b, NULL);
  1254. + {
  1255. + nir_store_var(&b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4), 0x1);
  1256. + }
  1257. + nir_pop_if(&b, NULL);
  1258. +
  1259. + off = nir_load_var(&b, cmd_buf_offset);
  1260. + nir_store_ssbo(&b, nir_load_var(&b, data), cmd_buf, off, .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
  1261. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 4), 0x1);
  1262. +
  1263. + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
  1264. + }
  1265. + nir_pop_loop(&b, NULL);
  1266. + }
  1267. + nir_pop_if(&b, NULL);
  1268. + nir_store_var(&b, shader_idx, nir_iadd_imm(&b, cur_shader_idx, 1), 0x1);
  1269. + }
  1270. + nir_pop_loop(&b, NULL);
  1271. + }
  1272. + nir_pop_if(&b, 0);
  1273. +
  1274. + nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, emit_state), 1));
  1275. + {
  1276. + nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, state_offset), stream_base);
  1277. + nir_ssa_def *state = nir_load_ssbo(&b, 1, 32, stream_buf, stream_offset, .align_mul = 4);
  1278. + state = nir_iand_imm(&b, state, 1);
  1279. +
  1280. + nir_ssa_def *reg =
  1281. + nir_ior(&b, load_param32(&b, pa_su_sc_mode_cntl_base), nir_ishl_imm(&b, state, 2));
  1282. +
  1283. + nir_ssa_def *cmd_values[3] = {
  1284. + nir_imm_int(&b, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)),
  1285. + nir_imm_int(&b, (R_028814_PA_SU_SC_MODE_CNTL - SI_CONTEXT_REG_OFFSET) >> 2), reg};
  1286. +
  1287. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1288. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1289. +
  1290. + nir_store_ssbo(&b, nir_vec(&b, cmd_values, 3), cmd_buf, off, .write_mask = 0x7,
  1291. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1292. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 0xc), 0x1);
  1293. + }
  1294. + nir_pop_if(&b, NULL);
  1295. +
  1296. + nir_ssa_def *scissor_count = load_param16(&b, scissor_count);
  1297. + nir_push_if(&b, nir_ine(&b, scissor_count, nir_imm_int(&b, 0)));
  1298. + {
  1299. + nir_ssa_def *scissor_offset = load_param16(&b, scissor_offset);
  1300. + nir_variable *idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(),
  1301. + "scissor_copy_idx");
  1302. + nir_store_var(&b, idx, nir_imm_int(&b, 0), 1);
  1303. +
  1304. + nir_push_loop(&b);
  1305. + {
  1306. + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
  1307. + nir_push_if(&b, nir_uge(&b, cur_idx, scissor_count));
  1308. + {
  1309. + nir_jump(&b, nir_jump_break);
  1310. + }
  1311. + nir_pop_if(&b, NULL);
  1312. +
  1313. + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
  1314. + nir_ssa_def *param_offset = nir_iadd(&b, scissor_offset, nir_imul_imm(&b, cur_idx, 4));
  1315. + nir_ssa_def *value = nir_load_ssbo(&b, 1, 32, param_buf, param_offset, .align_mul = 4);
  1316. +
  1317. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1318. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1319. +
  1320. + nir_store_ssbo(&b, value, cmd_buf, off, .write_mask = 0x1,
  1321. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1322. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 4), 0x1);
  1323. +
  1324. + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 1);
  1325. + }
  1326. + nir_pop_loop(&b, NULL);
  1327. + }
  1328. + nir_pop_if(&b, NULL);
  1329. +
  1330. + nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0));
  1331. + {
  1332. + nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
  1333. + nir_ssa_def *stream_offset =
  1334. + nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);
  1335. +
  1336. + nir_ssa_def *draw_data0 =
  1337. + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
  1338. + nir_ssa_def *vertex_count = nir_channel(&b, draw_data0, 0);
  1339. + nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
  1340. + nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 2);
  1341. + nir_ssa_def *first_instance = nir_channel(&b, draw_data0, 3);
  1342. +
  1343. + nir_push_if(&b, nir_iand(&b, nir_ine(&b, vertex_count, nir_imm_int(&b, 0)), nir_ine(&b, instance_count, nir_imm_int(&b, 0))));
  1344. + {
  1345. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1346. + off = dgc_emit_userdata_vertex(&b, off, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
  1347. + off = dgc_emit_instance_count(&b, off, instance_count);
  1348. + off = dgc_emit_draw(&b, off, vertex_count);
  1349. + nir_store_var(&b, cmd_buf_offset, off, 0x1);
  1350. + }
  1351. + nir_pop_if(&b, 0);
  1352. + }
  1353. + nir_push_else(&b, NULL);
  1354. + {
  1355. + nir_variable *index_size_var =
  1356. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "index_size");
  1357. + nir_store_var(&b, index_size_var, load_param16(&b, base_index_size), 0x1);
  1358. + nir_variable *max_index_count_var =
  1359. + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
  1360. + nir_store_var(&b, max_index_count_var, load_param32(&b, max_index_count), 0x1);
  1361. +
  1362. + nir_ssa_def *bind_index_buffer = nir_ieq_imm(&b, nir_load_var(&b, index_size_var), 0);
  1363. + nir_push_if(&b, bind_index_buffer);
  1364. + {
  1365. + nir_ssa_def *index_stream_offset =
  1366. + nir_iadd(&b, load_param16(&b, index_buffer_offset), stream_base);
  1367. + nir_ssa_def *data =
  1368. + nir_load_ssbo(&b, 4, 32, stream_buf, index_stream_offset, .align_mul = 4);
  1369. +
  1370. + nir_ssa_def *vk_index_type = nir_channel(&b, data, 3);
  1371. + nir_ssa_def *index_type = nir_bcsel(
  1372. + &b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_32)),
  1373. + nir_imm_int(&b, V_028A7C_VGT_INDEX_32), nir_imm_int(&b, V_028A7C_VGT_INDEX_16));
  1374. + index_type = nir_bcsel(&b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_8)),
  1375. + nir_imm_int(&b, V_028A7C_VGT_INDEX_8), index_type);
  1376. +
  1377. + nir_ssa_def *index_size = nir_iand_imm(
  1378. + &b, nir_ushr(&b, nir_imm_int(&b, 0x142), nir_ishr_imm(&b, index_type, 2)), 0xf);
  1379. + nir_store_var(&b, index_size_var, index_size, 0x1);
  1380. +
  1381. + nir_ssa_def *max_index_count = nir_udiv(&b, nir_channel(&b, data, 2), index_size);
  1382. + nir_store_var(&b, max_index_count_var, max_index_count, 0x1);
  1383. +
  1384. + nir_ssa_def *cmd_values[3 + 2 + 3];
  1385. +
  1386. + if (dev->physical_device->rad_info.gfx_level >= GFX9) {
  1387. + unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
  1388. + if (dev->physical_device->rad_info.gfx_level < GFX9 ||
  1389. + (dev->physical_device->rad_info.gfx_level == GFX9 &&
  1390. + dev->physical_device->rad_info.me_fw_version < 26))
  1391. + opcode = PKT3_SET_UCONFIG_REG;
  1392. + cmd_values[0] = nir_imm_int(&b, PKT3(opcode, 1, 0));
  1393. + cmd_values[1] = nir_imm_int(
  1394. + &b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
  1395. + cmd_values[2] = index_type;
  1396. + } else {
  1397. + cmd_values[0] = nir_imm_int(&b, PKT3(PKT3_INDEX_TYPE, 0, 0));
  1398. + cmd_values[1] = index_type;
  1399. + cmd_values[2] = nir_imm_int(&b, PKT3_NOP_PAD);
  1400. + }
  1401. +
  1402. + nir_ssa_def *addr_upper = nir_channel(&b, data, 1);
  1403. + addr_upper = nir_ishr_imm(&b, nir_ishl(&b, addr_upper, nir_imm_int(&b, 16)), 16);
  1404. +
  1405. + cmd_values[3] = nir_imm_int(&b, PKT3(PKT3_INDEX_BASE, 1, 0));
  1406. + cmd_values[4] = nir_channel(&b, data, 0);
  1407. + cmd_values[5] = addr_upper;
  1408. + cmd_values[6] = nir_imm_int(&b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
  1409. + cmd_values[7] = max_index_count;
  1410. +
  1411. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1412. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1413. + nir_store_ssbo(&b, nir_vec(&b, cmd_values, 4), cmd_buf, off, .write_mask = 0xf,
  1414. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1415. + nir_store_ssbo(&b, nir_vec(&b, cmd_values + 4, 4), cmd_buf, nir_iadd_imm(&b, off, 16),
  1416. + .write_mask = 0xf, .access = ACCESS_NON_READABLE, .align_mul = 4);
  1417. + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 0x20), 0x1);
  1418. + }
  1419. + nir_pop_if(&b, NULL);
  1420. +
  1421. + nir_ssa_def *index_size = nir_load_var(&b, index_size_var);
  1422. + nir_ssa_def *max_index_count = nir_load_var(&b, max_index_count_var);
  1423. + nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
  1424. + nir_ssa_def *stream_offset =
  1425. + nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);
  1426. +
  1427. + index_size =
  1428. + nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, index_size_var), index_size);
  1429. + max_index_count = nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, max_index_count_var),
  1430. + max_index_count);
  1431. + nir_ssa_def *draw_data0 =
  1432. + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
  1433. + nir_ssa_def *draw_data1 = nir_load_ssbo(
  1434. + &b, 1, 32, stream_buf, nir_iadd_imm(&b, stream_offset, 16), .align_mul = 4);
  1435. + nir_ssa_def *index_count = nir_channel(&b, draw_data0, 0);
  1436. + nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
  1437. + nir_ssa_def *first_index = nir_channel(&b, draw_data0, 2);
  1438. + nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 3);
  1439. + nir_ssa_def *first_instance = nir_channel(&b, draw_data1, 0);
  1440. +
  1441. + nir_push_if(&b, nir_iand(&b, nir_ine(&b, index_count, nir_imm_int(&b, 0)), nir_ine(&b, instance_count, nir_imm_int(&b, 0))));
  1442. + {
  1443. + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
  1444. + off = dgc_emit_userdata_vertex(&b, off, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
  1445. + off = dgc_emit_instance_count(&b, off, instance_count);
  1446. + off = dgc_emit_draw_indexed(&b, off, first_index, index_count,
  1447. + max_index_count);
  1448. + nir_store_var(&b, cmd_buf_offset, off, 0x1);
  1449. + }
  1450. + nir_pop_if(&b, 0);
  1451. + }
  1452. + nir_pop_if(&b, NULL);
  1453. +
  1454. + /* Pad the cmdbuffer if we did not use the whole stride */
  1455. + nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf_offset), cmd_buf_end));
  1456. + {
  1457. + nir_ssa_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf_offset));
  1458. + cnt = nir_ushr_imm(&b, cnt, 2);
  1459. + cnt = nir_iadd_imm(&b, cnt, -2);
  1460. + nir_ssa_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt);
  1461. +
  1462. + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
  1463. + nir_store_ssbo(&b, pkt, cmd_buf, nir_load_var(&b, cmd_buf_offset), .write_mask = 0x1,
  1464. + .access = ACCESS_NON_READABLE, .align_mul = 4);
  1465. + }
  1466. + nir_pop_if(&b, NULL);
  1467. + }
  1468. + nir_pop_if(&b, NULL);
  1469. +
  1470. + build_dgc_buffer_tail(&b, sequence_count);
  1471. + return b.shader;
  1472. +}
  1473. +
  1474. +void
  1475. +radv_device_finish_dgc_prepare_state(struct radv_device *device)
  1476. +{
  1477. + radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline,
  1478. + &device->meta_state.alloc);
  1479. + radv_DestroyPipelineLayout(radv_device_to_handle(device),
  1480. + device->meta_state.dgc_prepare.p_layout, &device->meta_state.alloc);
  1481. + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
  1482. + device->meta_state.dgc_prepare.ds_layout,
  1483. + &device->meta_state.alloc);
  1484. +}
  1485. +
  1486. +VkResult
  1487. +radv_device_init_dgc_prepare_state(struct radv_device *device)
  1488. +{
  1489. + VkResult result;
  1490. + nir_shader *cs = build_dgc_prepare_shader(device);
  1491. +
  1492. + VkDescriptorSetLayoutCreateInfo ds_create_info = {
  1493. + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
  1494. + .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
  1495. + .bindingCount = 5,
  1496. + .pBindings = (VkDescriptorSetLayoutBinding[]){
  1497. + {.binding = 0, // index
  1498. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  1499. + .descriptorCount = 1,
  1500. + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  1501. + .pImmutableSamplers = NULL},
  1502. + {.binding = 1, // token stream
  1503. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  1504. + .descriptorCount = 1,
  1505. + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  1506. + .pImmutableSamplers = NULL},
  1507. + {.binding = 2, // prepare buffer
  1508. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  1509. + .descriptorCount = 1,
  1510. + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  1511. + .pImmutableSamplers = NULL},
  1512. + {.binding = 3, // params
  1513. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  1514. + .descriptorCount = 1,
  1515. + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  1516. + .pImmutableSamplers = NULL},
  1517. + {.binding = 4, // count buffer
  1518. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  1519. + .descriptorCount = 1,
  1520. + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  1521. + .pImmutableSamplers = NULL},
  1522. + }};
  1523. +
  1524. + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info,
  1525. + &device->meta_state.alloc,
  1526. + &device->meta_state.dgc_prepare.ds_layout);
  1527. + if (result != VK_SUCCESS)
  1528. + goto fail;
  1529. +
  1530. + const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
  1531. + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
  1532. + .setLayoutCount = 1,
  1533. + .pSetLayouts = &device->meta_state.dgc_prepare.ds_layout,
  1534. + .pushConstantRangeCount = 1,
  1535. + .pPushConstantRanges =
  1536. + &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)},
  1537. + };
  1538. +
  1539. + result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
  1540. + &device->meta_state.alloc,
  1541. + &device->meta_state.dgc_prepare.p_layout);
  1542. + if (result != VK_SUCCESS)
  1543. + goto fail;
  1544. +
  1545. + VkPipelineShaderStageCreateInfo shader_stage = {
  1546. + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
  1547. + .stage = VK_SHADER_STAGE_COMPUTE_BIT,
  1548. + .module = vk_shader_module_handle_from_nir(cs),
  1549. + .pName = "main",
  1550. + .pSpecializationInfo = NULL,
  1551. + };
  1552. +
  1553. + VkComputePipelineCreateInfo pipeline_info = {
  1554. + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
  1555. + .stage = shader_stage,
  1556. + .flags = 0,
  1557. + .layout = device->meta_state.dgc_prepare.p_layout,
  1558. + };
  1559. +
  1560. + result = radv_CreateComputePipelines(
  1561. + radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
  1562. + &pipeline_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline);
  1563. + if (result != VK_SUCCESS)
  1564. + goto fail;
  1565. +
  1566. + ralloc_free(cs);
  1567. + return VK_SUCCESS;
  1568. +fail:
  1569. + radv_device_finish_dgc_prepare_state(device);
  1570. + ralloc_free(cs);
  1571. + return result;
  1572. +}
  1573. diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
  1574. index caba21759ab..35bf5ab5bba 100644
  1575. --- a/src/amd/vulkan/radv_meta.c
  1576. +++ b/src/amd/vulkan/radv_meta.c
  1577. @@ -622,10 +622,16 @@ radv_device_init_meta(struct radv_device *device)
  1578. if (result != VK_SUCCESS)
  1579. goto fail_etc_decode;
  1580.  
  1581. + result = radv_device_init_dgc_prepare_state(device);
  1582. + if (result != VK_SUCCESS)
  1583. + goto fail_dgc;
  1584. +
  1585. device->app_shaders_internal = false;
  1586.  
  1587. return VK_SUCCESS;
  1588.  
  1589. +fail_dgc:
  1590. + radv_device_finish_meta_etc_decode_state(device);
  1591. fail_etc_decode:
  1592. radv_device_finish_meta_fmask_copy_state(device);
  1593. fail_fmask_copy:
  1594. @@ -663,6 +669,7 @@ fail_clear:
  1595. void
  1596. radv_device_finish_meta(struct radv_device *device)
  1597. {
  1598. + radv_device_finish_dgc_prepare_state(device);
  1599. radv_device_finish_meta_etc_decode_state(device);
  1600. radv_device_finish_accel_struct_build_state(device);
  1601. radv_device_finish_meta_clear_state(device);
  1602. diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
  1603. index 0f9388acd98..6ba7b8d286e 100644
  1604. --- a/src/amd/vulkan/radv_meta.h
  1605. +++ b/src/amd/vulkan/radv_meta.h
  1606. @@ -107,6 +107,9 @@ void radv_device_finish_accel_struct_build_state(struct radv_device *device);
  1607. VkResult radv_device_init_meta_etc_decode_state(struct radv_device *device, bool on_demand);
  1608. void radv_device_finish_meta_etc_decode_state(struct radv_device *device);
  1609.  
  1610. +VkResult radv_device_init_dgc_prepare_state(struct radv_device *device);
  1611. +void radv_device_finish_dgc_prepare_state(struct radv_device *device);
  1612. +
  1613. void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer,
  1614. uint32_t flags);
  1615.  
  1616. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  1617. index 9a6790090dc..6c1d78fec9e 100644
  1618. --- a/src/amd/vulkan/radv_private.h
  1619. +++ b/src/amd/vulkan/radv_private.h
  1620. @@ -689,6 +689,12 @@ struct radv_meta_state {
  1621. VkPipelineLayout p_layout;
  1622. VkPipeline pipeline;
  1623. } etc_decode;
  1624. +
  1625. + struct {
  1626. + VkDescriptorSetLayout ds_layout;
  1627. + VkPipelineLayout p_layout;
  1628. + VkPipeline pipeline;
  1629. + } dgc_prepare;
  1630. };
  1631.  
  1632. #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
  1633. --
  1634. 2.36.1
  1635.  
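For reference, the tail padding emitted at the end of build_dgc_prepare_shader() above (the PKT3_NOP whose count is "remaining dwords - 2") follows the usual PM4 type-3 header rule: the count field holds the number of payload dwords minus one, so header plus payload exactly fills the unused part of the per-sequence stride. Below is a minimal host-side sketch of the same arithmetic; the PKT3 macro and the pad_with_nop helper are simplified stand-ins for illustration, not the driver's definitions.

#include <assert.h>
#include <stdint.h>

#define PKT3_NOP 0x10
/* Simplified type-3 header: type=3, 14-bit count, opcode; no predicate bit. */
#define PKT3(op, count) (0xC0000000u | (((uint32_t)(count) & 0x3FFFu) << 16) | ((uint32_t)(op) << 8))

/* Mirrors the NIR sequence: cnt = (cmd_buf_end - offset) / 4 - 2, then emit PKT3_NOP. */
static void
pad_with_nop(uint32_t *cs, uint32_t offset_dw, uint32_t end_dw)
{
   if (offset_dw == end_dw)
      return;                     /* stride fully used, nothing to pad */

   uint32_t remaining = end_dw - offset_dw;
   assert(remaining >= 2);        /* simplified: assumes room for header + at least one payload dword */

   /* count = payload dwords - 1 = (remaining - 1) - 1 = remaining - 2 */
   cs[offset_dw] = PKT3(PKT3_NOP, remaining - 2);
   /* the remaining - 1 payload dwords are skipped by the CP, their contents don't matter */
}
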
  1636. From d686009428f84a8753a96dfb6b4b8cac931b2bf0 Mon Sep 17 00:00:00 2001
  1637. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  1638. Date: Mon, 27 Jun 2022 23:21:08 +0200
  1639. Subject: [PATCH 09/12] radv: Implement DGC generated command layout structure.
  1640.  
  1641. ---
  1642. .../vulkan/radv_device_generated_commands.c | 185 ++++++++++++++++++
  1643. src/amd/vulkan/radv_private.h | 30 +++
  1644. 2 files changed, 215 insertions(+)
  1645.  
  1646. diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
  1647. index 68d8e4d5060..c9838b4839b 100644
  1648. --- a/src/amd/vulkan/radv_device_generated_commands.c
  1649. +++ b/src/amd/vulkan/radv_device_generated_commands.c
  1650. @@ -26,6 +26,72 @@
  1651.  
  1652. #include "nir_builder.h"
  1653.  
  1654. +static void
  1655. +radv_get_sequence_size(const struct radv_indirect_command_layout *layout,
  1656. + const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
  1657. + uint32_t *upload_size)
  1658. +{
  1659. + *cmd_size = 0;
  1660. + *upload_size = 0;
  1661. +
  1662. + if (layout->bind_vbo_mask) {
  1663. + *upload_size += 16 * util_bitcount(pipeline->vb_desc_usage_mask);
  1664. + *cmd_size += 3 * 4;
  1665. + }
  1666. +
  1667. + if (layout->push_constant_mask) {
  1668. + bool need_copy = false;
  1669. +
  1670. + for (unsigned i = 0; i < ARRAY_SIZE(pipeline->base.shaders); ++i) {
  1671. + if (!pipeline->base.shaders[i])
  1672. + continue;
  1673. +
  1674. + struct radv_userdata_locations *locs = &pipeline->base.shaders[i]->info.user_sgprs_locs;
  1675. + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
  1676. + *cmd_size += 12;
  1677. + need_copy = true;
  1678. + }
  1679. + if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
  1680. + *cmd_size += 8 + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].num_sgprs * 4;
  1681. + }
  1682. + if (need_copy)
  1683. + *upload_size +=
  1684. + align(pipeline->base.push_constant_size + 16 * pipeline->base.dynamic_offset_count, 16);
  1685. + }
  1686. +
  1687. + if (layout->binds_index_buffer)
  1688. + *cmd_size += (3 + 2 + 3) * 4;
  1689. + if (layout->indexed)
  1690. + *cmd_size += (5 + 2 + 5) * 4;
  1691. + else
  1692. + *cmd_size += (5 + 2 + 3) * 4;
  1693. +
  1694. + if (layout->binds_state) {
  1695. + *cmd_size += 3 * 4;
  1696. +
  1697. + if (pipeline->base.device->physical_device->rad_info.has_gfx9_scissor_bug)
  1698. + *cmd_size += (8 + 2 * MAX_SCISSORS) * 4;
  1699. + }
  1700. +}
  1701. +
  1702. +static uint32_t
  1703. +radv_align_cmdbuf_size(uint32_t size)
  1704. +{
  1705. + return align(MAX2(1, size), 256);
  1706. +}
  1707. +
  1708. +uint32_t
  1709. +radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
  1710. +{
  1711. + VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
  1712. + VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
  1713. + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
  1714. +
  1715. + uint32_t cmd_size, upload_size;
  1716. + radv_get_sequence_size(layout, graphics_pipeline, &cmd_size, &upload_size);
  1717. + return radv_align_cmdbuf_size(cmd_size * cmd_info->sequencesCount);
  1718. +}
  1719. +
  1720. enum radv_dgc_token_type {
  1721. RADV_DGC_INDEX_BUFFER,
  1722. RADV_DGC_DRAW,
  1723. @@ -890,3 +956,122 @@ fail:
  1724. ralloc_free(cs);
  1725. return result;
  1726. }
  1727. +
  1728. +VkResult
  1729. +radv_CreateIndirectCommandsLayoutNV(VkDevice _device,
  1730. + const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
  1731. + const VkAllocationCallbacks *pAllocator,
  1732. + VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
  1733. +{
  1734. + RADV_FROM_HANDLE(radv_device, device, _device);
  1735. + struct radv_indirect_command_layout *layout;
  1736. +
  1737. + size_t size =
  1738. + sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
  1739. +
  1740. + layout =
  1741. + vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
  1742. + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
  1743. + if (!layout)
  1744. + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
  1745. +
  1746. + vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
  1747. +
  1748. + layout->input_stride = pCreateInfo->pStreamStrides[0];
  1749. + layout->token_count = pCreateInfo->tokenCount;
  1750. + typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
  1751. +
  1752. + layout->indexed = false;
  1753. + layout->binds_index_buffer = false;
  1754. + layout->bind_vbo_mask = 0;
  1755. + layout->push_constant_mask = 0;
  1756. +
  1757. + layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
  1758. + layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_EXT;
  1759. +
  1760. + for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
  1761. + switch (pCreateInfo->pTokens[i].tokenType) {
  1762. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
  1763. + layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
  1764. + break;
  1765. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
  1766. + layout->indexed = true;
  1767. + layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
  1768. + break;
  1769. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
  1770. + layout->binds_index_buffer = true;
  1771. + layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
  1772. + /* 16-bit is implied if we find no match. */
  1773. + for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
  1774. + if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
  1775. + layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
  1776. + else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_EXT)
  1777. + layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
  1778. + }
  1779. + break;
  1780. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
  1781. + layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
  1782. + layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] =
  1783. + pCreateInfo->pTokens[i].offset;
  1784. + if (pCreateInfo->pTokens[i].vertexDynamicStride)
  1785. + layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= 1u << 15;
  1786. + break;
  1787. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
  1788. + for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
  1789. + k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
  1790. + layout->push_constant_mask |= 1ull << j;
  1791. + layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
  1792. + }
  1793. + break;
  1794. + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
  1795. + layout->binds_state = true;
  1796. + layout->state_offset = pCreateInfo->pTokens[i].offset;
  1797. + break;
  1798. + default:
  1799. + unreachable("Unhandled token type");
  1800. + }
  1801. + }
  1802. + if (!layout->indexed)
  1803. + layout->binds_index_buffer = false;
  1804. +
  1805. + *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
  1806. + return VK_SUCCESS;
  1807. +}
  1808. +
  1809. +void
  1810. +radv_DestroyIndirectCommandsLayoutNV(VkDevice _device,
  1811. + VkIndirectCommandsLayoutNV indirectCommandsLayout,
  1812. + const VkAllocationCallbacks *pAllocator)
  1813. +{
  1814. + RADV_FROM_HANDLE(radv_device, device, _device);
  1815. + VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
  1816. +
  1817. + if (!layout)
  1818. + return;
  1819. +
  1820. + vk_object_base_finish(&layout->base);
  1821. + vk_free2(&device->vk.alloc, pAllocator, layout);
  1822. +}
  1823. +
  1824. +void
  1825. +radv_GetGeneratedCommandsMemoryRequirementsNV(
  1826. + VkDevice _device, const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
  1827. + VkMemoryRequirements2 *pMemoryRequirements)
  1828. +{
  1829. + RADV_FROM_HANDLE(radv_device, device, _device);
  1830. + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
  1831. + VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
  1832. + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
  1833. +
  1834. + uint32_t cmd_stride, upload_stride;
  1835. + radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);
  1836. +
  1837. + VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(cmd_stride * pInfo->maxSequencesCount);
  1838. + VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
  1839. +
  1840. + pMemoryRequirements->memoryRequirements.memoryTypeBits =
  1841. + device->physical_device->memory_types_32bit;
  1842. + pMemoryRequirements->memoryRequirements.alignment = 256;
  1843. + pMemoryRequirements->memoryRequirements.size =
  1844. + align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
  1845. +}
  1846. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  1847. index 6c1d78fec9e..38693974243 100644
  1848. --- a/src/amd/vulkan/radv_private.h
  1849. +++ b/src/amd/vulkan/radv_private.h
  1850. @@ -2917,6 +2917,34 @@ void radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer);
  1851. void radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
  1852. const struct radv_barrier_data *barrier);
  1853.  
  1854. +struct radv_indirect_command_layout {
  1855. + struct vk_object_base base;
  1856. +
  1857. + uint32_t input_stride;
  1858. + uint32_t token_count;
  1859. +
  1860. + bool indexed;
  1861. + bool binds_index_buffer;
  1862. + bool binds_state;
  1863. + uint16_t draw_params_offset;
  1864. + uint16_t index_buffer_offset;
  1865. +
  1866. + uint16_t state_offset;
  1867. +
  1868. + uint32_t bind_vbo_mask;
  1869. + uint32_t vbo_offsets[MAX_VBS];
  1870. +
  1871. + uint64_t push_constant_mask;
  1872. + uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4];
  1873. +
  1874. + uint32_t ibo_type_32;
  1875. + uint32_t ibo_type_8;
  1876. +
  1877. + VkIndirectCommandsLayoutTokenNV tokens[0];
  1878. +};
  1879. +
  1880. +uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
  1881. +
  1882. uint64_t radv_get_current_time(void);
  1883.  
  1884. static inline uint32_t
  1885. @@ -3150,6 +3178,8 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(radv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
  1886. VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
  1887. VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, vk.base, VkImageView,
  1888. VK_OBJECT_TYPE_IMAGE_VIEW);
  1889. +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, base, VkIndirectCommandsLayoutNV,
  1890. + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV)
  1891. VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, base, VkPipelineCache,
  1892. VK_OBJECT_TYPE_PIPELINE_CACHE)
  1893. VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, base, VkPipeline,
  1894. --
  1895. 2.36.1
  1896.  
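For context, here is a minimal application-side sketch of how the two entry points implemented above are typically driven: create a layout describing the token stream, then ask the driver how large the preprocess buffer has to be. Only the Vulkan API names are real; device, pipeline and max_sequences are placeholder handles/values, and the example uses a single stream whose per-sequence record is just a VkDrawIndirectCommand at offset 0.

/* One stream, one DRAW token at the start of each sequence. */
VkIndirectCommandsLayoutTokenNV draw_token = {
   .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV,
   .tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV,
   .stream = 0,
   .offset = 0,
};

uint32_t stream_stride = sizeof(VkDrawIndirectCommand);

VkIndirectCommandsLayoutCreateInfoNV layout_info = {
   .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV,
   .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
   .tokenCount = 1,
   .pTokens = &draw_token,
   .streamCount = 1,
   .pStreamStrides = &stream_stride,
};

VkIndirectCommandsLayoutNV dgc_layout;
vkCreateIndirectCommandsLayoutNV(device, &layout_info, NULL, &dgc_layout);

/* Size the preprocess buffer. As seen in radv_GetGeneratedCommandsMemoryRequirementsNV
 * above, RADV points memoryTypeBits at its 32-bit memory types. */
VkGeneratedCommandsMemoryRequirementsInfoNV req_info = {
   .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV,
   .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
   .pipeline = pipeline,
   .indirectCommandsLayout = dgc_layout,
   .maxSequencesCount = max_sequences,
};
VkMemoryRequirements2 reqs = {.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2};
vkGetGeneratedCommandsMemoryRequirementsNV(device, &req_info, &reqs);
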
  1897. From 4f08d81a892e55f463611ee2544205c1a7da0945 Mon Sep 17 00:00:00 2001
  1898. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  1899. Date: Mon, 27 Jun 2022 23:28:14 +0200
  1900. Subject: [PATCH 10/12] radv: Implement DGC cmdbuffer generation.
  1901.  
  1902. ---
  1903. .../vulkan/radv_device_generated_commands.c | 287 ++++++++++++++++++
  1904. src/amd/vulkan/radv_private.h | 3 +
  1905. 2 files changed, 290 insertions(+)
  1906.  
  1907. diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
  1908. index c9838b4839b..3a8aa562e3a 100644
  1909. --- a/src/amd/vulkan/radv_device_generated_commands.c
  1910. +++ b/src/amd/vulkan/radv_device_generated_commands.c
  1911. @@ -1075,3 +1075,290 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(
  1912. pMemoryRequirements->memoryRequirements.size =
  1913. align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
  1914. }
  1915. +
  1916. +static uint32_t
  1917. +radv_get_vgt_index_size(uint32_t type)
  1918. +{
  1919. + switch (type) {
  1920. + case V_028A7C_VGT_INDEX_8:
  1921. + return 1;
  1922. + case V_028A7C_VGT_INDEX_16:
  1923. + return 2;
  1924. + case V_028A7C_VGT_INDEX_32:
  1925. + return 4;
  1926. + default:
  1927. + unreachable("invalid index type");
  1928. + }
  1929. +}
  1930. +
  1931. +void
  1932. +radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
  1933. + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
  1934. +{
  1935. + /* Can't do anything here, since in some cases we depend on dynamic state that is only
  1936. + * known at draw time. */
  1937. +}
  1938. +
  1939. +/* Always need to call this directly before draw due to dependence on bound state. */
  1940. +void
  1941. +radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer,
  1942. + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
  1943. +{
  1944. + VK_FROM_HANDLE(radv_indirect_command_layout, layout,
  1945. + pGeneratedCommandsInfo->indirectCommandsLayout);
  1946. + VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
  1947. + VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
  1948. + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
  1949. + struct radv_meta_saved_state saved_state;
  1950. + struct radv_buffer token_buffer;
  1951. +
  1952. + if (cmd_buffer->device->meta_state.dgc_prepare.pipeline == VK_NULL_HANDLE) {
  1953. + radv_device_init_dgc_prepare_state(cmd_buffer->device);
  1954. + }
  1955. +
  1956. + uint32_t cmd_stride, upload_stride;
  1957. + radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);
  1958. +
  1959. + unsigned cmd_buf_size =
  1960. + radv_align_cmdbuf_size(cmd_stride * pGeneratedCommandsInfo->sequencesCount);
  1961. +
  1962. + radv_meta_save(
  1963. + &saved_state, cmd_buffer,
  1964. + RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
  1965. +
  1966. + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
  1967. + cmd_buffer->device->meta_state.dgc_prepare.pipeline);
  1968. +
  1969. + unsigned vb_size = layout->bind_vbo_mask ? util_bitcount(graphics_pipeline->vb_desc_usage_mask) * 24 : 0;
  1970. + unsigned const_size = graphics_pipeline->base.push_constant_size +
  1971. + 16 * graphics_pipeline->base.dynamic_offset_count +
  1972. + sizeof(layout->push_constant_offsets) + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;
  1973. + if (!layout->push_constant_mask)
  1974. + const_size = 0;
  1975. +
  1976. + unsigned scissor_size = (8 + 2 * cmd_buffer->state.dynamic.scissor.count) * 4;
  1977. + if (!layout->binds_state || !cmd_buffer->state.dynamic.scissor.count ||
  1978. + !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
  1979. + scissor_size = 0;
  1980. +
  1981. + unsigned upload_size = MAX2(vb_size + const_size + scissor_size, 16);
  1982. +
  1983. + void *upload_data;
  1984. + unsigned upload_offset;
  1985. + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, upload_size, &upload_offset, &upload_data))
  1986. + abort();
  1987. +
  1988. + void *upload_data_base = upload_data;
  1989. +
  1990. + radv_buffer_init(&token_buffer, cmd_buffer->device, cmd_buffer->upload.upload_bo, upload_size,
  1991. + upload_offset);
  1992. +
  1993. + uint64_t upload_addr = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
  1994. + pGeneratedCommandsInfo->preprocessOffset;
  1995. +
  1996. + uint16_t vtx_base_sgpr =
  1997. + (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
  1998. + if (cmd_buffer->state.graphics_pipeline->uses_drawid)
  1999. + vtx_base_sgpr |= 1u << 14;
  2000. + if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
  2001. + vtx_base_sgpr |= 1u << 15;
  2002. +
  2003. + uint16_t vbo_sgpr =
  2004. + ((radv_lookup_user_sgpr(&graphics_pipeline->base, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS)->sgpr_idx * 4 +
  2005. + graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX]) -
  2006. + SI_SH_REG_OFFSET) >>
  2007. + 2;
  2008. + struct radv_dgc_params params = {
  2009. + .cmd_buf_stride = cmd_stride,
  2010. + .cmd_buf_size = cmd_buf_size,
  2011. + .upload_addr = (uint32_t)upload_addr,
  2012. + .upload_stride = upload_stride,
  2013. + .sequence_count = pGeneratedCommandsInfo->sequencesCount,
  2014. + .stream_stride = layout->input_stride,
  2015. + .draw_indexed = layout->indexed,
  2016. + .draw_params_offset = layout->draw_params_offset,
  2017. + .base_index_size =
  2018. + layout->binds_index_buffer ? 0 : radv_get_vgt_index_size(cmd_buffer->state.index_type),
  2019. + .vtx_base_sgpr = vtx_base_sgpr,
  2020. + .max_index_count = cmd_buffer->state.max_index_count,
  2021. + .index_buffer_offset = layout->index_buffer_offset,
  2022. + .vbo_reg = vbo_sgpr,
  2023. + .ibo_type_32 = layout->ibo_type_32,
  2024. + .ibo_type_8 = layout->ibo_type_8,
  2025. + .emit_state = layout->binds_state,
  2026. + .pa_su_sc_mode_cntl_base = radv_get_pa_su_sc_mode_cntl(cmd_buffer) & C_028814_FACE,
  2027. + .state_offset = layout->state_offset,
  2028. + };
  2029. +
  2030. + if (layout->bind_vbo_mask) {
  2031. + write_vertex_descriptors(cmd_buffer, graphics_pipeline, upload_data);
  2032. +
  2033. + uint32_t *vbo_info = (uint32_t *)((char *)upload_data + graphics_pipeline->vb_desc_alloc_size);
  2034. +
  2035. + struct radv_shader *vs_shader = radv_get_shader(&graphics_pipeline->base, MESA_SHADER_VERTEX);
  2036. + const struct radv_vs_input_state *vs_state =
  2037. + vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
  2038. + uint32_t mask = graphics_pipeline->vb_desc_usage_mask;
  2039. + unsigned idx = 0;
  2040. + while (mask) {
  2041. + unsigned i = u_bit_scan(&mask);
  2042. + unsigned binding =
  2043. + vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
  2044. + : (graphics_pipeline->use_per_attribute_vb_descs ? graphics_pipeline->attrib_bindings[i] : i);
  2045. + uint32_t attrib_end =
  2046. + vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : graphics_pipeline->attrib_ends[i];
  2047. +
  2048. + params.vbo_bind_mask |= ((layout->bind_vbo_mask >> binding) & 1u) << idx;
  2049. + vbo_info[2 * idx] = ((graphics_pipeline->use_per_attribute_vb_descs ? 1u : 0u) << 31) |
  2050. + (vs_state ? vs_state->offsets[i] << 16 : 0) |
  2051. + layout->vbo_offsets[binding];
  2052. + vbo_info[2 * idx + 1] = graphics_pipeline->attrib_index_offset[i] | (attrib_end << 16);
  2053. + ++idx;
  2054. + }
  2055. + params.vbo_cnt = idx;
  2056. + upload_data = (char *)upload_data + vb_size;
  2057. + }
  2058. +
  2059. + if (layout->push_constant_mask) {
  2060. + uint32_t *desc = upload_data;
  2061. + upload_data = (char *)upload_data + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;
  2062. +
  2063. + unsigned idx = 0;
  2064. + for (unsigned i = 0; i < ARRAY_SIZE(graphics_pipeline->base.shaders); ++i) {
  2065. + if (!graphics_pipeline->base.shaders[i])
  2066. + continue;
  2067. +
  2068. + struct radv_userdata_locations *locs = &graphics_pipeline->base.shaders[i]->info.user_sgprs_locs;
  2069. + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
  2070. + params.const_copy = 1;
  2071. +
  2072. + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
  2073. + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
  2074. + unsigned upload_sgpr = 0;
  2075. + unsigned inline_sgpr = 0;
  2076. +
  2077. + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
  2078. + upload_sgpr =
  2079. + (graphics_pipeline->base.user_data_0[i] + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
  2080. + SI_SH_REG_OFFSET) >>
  2081. + 2;
  2082. + }
  2083. +
  2084. + if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
  2085. + inline_sgpr = (graphics_pipeline->base.user_data_0[i] +
  2086. + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
  2087. + SI_SH_REG_OFFSET) >>
  2088. + 2;
  2089. + desc[idx * 3 + 1] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask;
  2090. + desc[idx * 3 + 2] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask >> 32;
  2091. + }
  2092. + desc[idx * 3] = upload_sgpr | (inline_sgpr << 16);
  2093. + ++idx;
  2094. + }
  2095. + }
  2096. +
  2097. + params.push_constant_shader_cnt = idx;
  2098. +
  2099. + params.const_copy_size = graphics_pipeline->base.push_constant_size +
  2100. + 16 * graphics_pipeline->base.dynamic_offset_count;
  2101. + params.push_constant_mask = layout->push_constant_mask;
  2102. +
  2103. + memcpy(upload_data, layout->push_constant_offsets, sizeof(layout->push_constant_offsets));
  2104. + upload_data = (char *)upload_data + sizeof(layout->push_constant_offsets);
  2105. +
  2106. + memcpy(upload_data, cmd_buffer->push_constants, graphics_pipeline->base.push_constant_size);
  2107. + upload_data = (char *)upload_data + graphics_pipeline->base.push_constant_size;
  2108. +
  2109. + struct radv_descriptor_state *descriptors_state =
  2110. + radv_get_descriptors_state(cmd_buffer, pGeneratedCommandsInfo->pipelineBindPoint);
  2111. + memcpy(upload_data, descriptors_state->dynamic_buffers, 16 * graphics_pipeline->base.dynamic_offset_count);
  2112. + }
  2113. +
  2114. + if (scissor_size) {
  2115. + params.scissor_offset = (char*)upload_data - (char*)upload_data_base;
  2116. + params.scissor_count = scissor_size / 4;
  2117. +
  2118. + struct radeon_cmdbuf scissor_cs = {
  2119. + .buf = upload_data,
  2120. + .cdw = 0,
  2121. + .max_dw = scissor_size / 4
  2122. + };
  2123. +
  2124. + si_write_scissors(&scissor_cs, 0, cmd_buffer->state.dynamic.scissor.count,
  2125. + cmd_buffer->state.dynamic.scissor.scissors,
  2126. + cmd_buffer->state.dynamic.viewport.viewports,
  2127. + cmd_buffer->state.emitted_graphics_pipeline->can_use_guardband);
  2128. + assert(scissor_cs.cdw * 4 == scissor_size);
  2129. + upload_data = (char *)upload_data + scissor_size;
  2130. + }
  2131. +
  2132. + VkWriteDescriptorSet ds_writes[5];
  2133. + VkDescriptorBufferInfo buf_info[ARRAY_SIZE(ds_writes)];
  2134. + int ds_cnt = 0;
  2135. + buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer),
  2136. + .offset = 0,
  2137. + .range = upload_size};
  2138. + ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
  2139. + .dstBinding = 3,
  2140. + .dstArrayElement = 0,
  2141. + .descriptorCount = 1,
  2142. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  2143. + .pBufferInfo = &buf_info[ds_cnt]};
  2144. + ++ds_cnt;
  2145. +
  2146. + buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->preprocessBuffer,
  2147. + .offset = pGeneratedCommandsInfo->preprocessOffset,
  2148. + .range = pGeneratedCommandsInfo->preprocessSize};
  2149. + ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
  2150. + .dstBinding = 2,
  2151. + .dstArrayElement = 0,
  2152. + .descriptorCount = 1,
  2153. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  2154. + .pBufferInfo = &buf_info[ds_cnt]};
  2155. + ++ds_cnt;
  2156. +
  2157. + if (pGeneratedCommandsInfo->streamCount > 0) {
  2158. + buf_info[ds_cnt] =
  2159. + (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->pStreams[0].buffer,
  2160. + .offset = pGeneratedCommandsInfo->pStreams[0].offset,
  2161. + .range = VK_WHOLE_SIZE};
  2162. + ds_writes[ds_cnt] =
  2163. + (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
  2164. + .dstBinding = 1,
  2165. + .dstArrayElement = 0,
  2166. + .descriptorCount = 1,
  2167. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  2168. + .pBufferInfo = &buf_info[ds_cnt]};
  2169. + ++ds_cnt;
  2170. + }
  2171. +
  2172. + if (pGeneratedCommandsInfo->sequencesCountBuffer != VK_NULL_HANDLE) {
  2173. + buf_info[ds_cnt] =
  2174. + (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->sequencesCountBuffer,
  2175. + .offset = pGeneratedCommandsInfo->sequencesCountOffset,
  2176. + .range = VK_WHOLE_SIZE};
  2177. + ds_writes[ds_cnt] =
  2178. + (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
  2179. + .dstBinding = 4,
  2180. + .dstArrayElement = 0,
  2181. + .descriptorCount = 1,
  2182. + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
  2183. + .pBufferInfo = &buf_info[ds_cnt]};
  2184. + ++ds_cnt;
  2185. + params.sequence_count = UINT32_MAX;
  2186. + }
  2187. +
  2188. + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
  2189. + cmd_buffer->device->meta_state.dgc_prepare.p_layout,
  2190. + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);
  2191. +
  2192. + radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
  2193. + cmd_buffer->device->meta_state.dgc_prepare.p_layout, 0, ds_cnt,
  2194. + ds_writes);
  2195. +
  2196. + unsigned block_count = MAX2(1, round_up_u32(pGeneratedCommandsInfo->sequencesCount, 64));
  2197. + radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
  2198. +
  2199. + radv_buffer_finish(&token_buffer);
  2200. + radv_meta_restore(&saved_state, cmd_buffer);
  2201. +}
  2202. \ No newline at end of file
  2203. diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
  2204. index 38693974243..b4f088250ac 100644
  2205. --- a/src/amd/vulkan/radv_private.h
  2206. +++ b/src/amd/vulkan/radv_private.h
  2207. @@ -2945,6 +2945,9 @@ struct radv_indirect_command_layout {
  2208.  
  2209. uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
  2210.  
  2211. +void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer,
  2212. + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
  2213. +
  2214. uint64_t radv_get_current_time(void);
  2215.  
  2216. static inline uint32_t
  2217. --
  2218. 2.36.1
  2219.  
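To make the upload performed by radv_prepare_dgc above easier to follow, the sketch below restates how the token/params buffer it fills is sized and ordered. The helper name and parameters are invented for readability; the patch computes the same quantities inline while advancing a raw pointer.

/* Illustrative sketch only: mirrors the vb_size / const_size / scissor_size math
 * in radv_prepare_dgc and the order in which the regions are written. */
static unsigned
dgc_upload_layout_size(unsigned vbo_attribute_count,      /* bitcount of vb_desc_usage_mask */
                       unsigned shader_slot_count,         /* entries in pipeline->base.shaders */
                       unsigned push_constant_bytes,       /* pipeline push_constant_size */
                       unsigned dynamic_offset_count,
                       unsigned push_offsets_table_bytes,  /* sizeof(layout->push_constant_offsets) */
                       int      has_push_constants,
                       unsigned scissor_bytes)             /* 0 unless the GFX9 scissor re-emit path is taken */
{
   unsigned size = 0;

   /* 16-byte vertex buffer descriptor plus 2 dwords of per-attribute metadata. */
   size += vbo_attribute_count * 24;

   if (has_push_constants) {
      size += shader_slot_count * 12;     /* upload_sgpr | inline_sgpr << 16, plus 64-bit inline mask */
      size += push_offsets_table_bytes;   /* per-dword source offsets copied from the layout */
      size += push_constant_bytes;        /* current push constant values */
      size += dynamic_offset_count * 16;  /* dynamic buffer descriptors */
   }

   size += scissor_bytes;                 /* si_write_scissors() output for the workaround */

   return size > 16 ? size : 16;          /* the patch uses MAX2(..., 16) */
}
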
  2220. From c16a5dc84401af89283ecd3b8188cc30f788b94e Mon Sep 17 00:00:00 2001
  2221. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  2222. Date: Mon, 27 Jun 2022 23:29:04 +0200
  2223. Subject: [PATCH 11/12] radv: Implement CmdExecuteGeneratedCommandsNV.
  2224.  
  2225. ---
  2226. src/amd/vulkan/radv_cmd_buffer.c | 80 ++++++++++++++++++++++++++++++++
  2227. 1 file changed, 80 insertions(+)
  2228.  
  2229. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  2230. index eb68eaa53d1..be077eac52e 100644
  2231. --- a/src/amd/vulkan/radv_cmd_buffer.c
  2232. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  2233. @@ -2673,6 +2673,11 @@ radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
  2234. struct radeon_cmdbuf *cs = cmd_buffer->cs;
  2235. struct radv_cmd_state *state = &cmd_buffer->state;
  2236.  
  2237. + /* With indirect generated commands the index buffer bind may be part of the
  2238. + * indirect command buffer, in which case the app may not have bound any yet. */
  2239. + if (state->index_type < 0)
  2240. + return;
  2241. +
  2242. /* For the direct indexed draws we use DRAW_INDEX_2, which includes
  2243. * the index_va and max_index_count already. */
  2244. if (!indirect)
  2245. @@ -7375,6 +7380,81 @@ radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _bu
  2246. radv_after_draw(cmd_buffer);
  2247. }
  2248.  
  2249. +void
  2250. +radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
  2251. + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
  2252. +{
  2253. + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
  2254. + VK_FROM_HANDLE(radv_indirect_command_layout, layout,
  2255. + pGeneratedCommandsInfo->indirectCommandsLayout);
  2256. + VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
  2257. +
  2258. + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
  2259. + cmd_buffer->state.flush_bits |=
  2260. + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
  2261. +
  2262. + struct radv_draw_info info;
  2263. +
  2264. + info.count = pGeneratedCommandsInfo->sequencesCount;
  2265. + info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
  2266. + signal that this is not direct. */
  2267. + info.indirect_offset = 0;
  2268. + info.stride = 0;
  2269. + info.strmout_buffer = NULL;
  2270. + info.count_buffer = NULL;
  2271. + info.indexed = layout->indexed;
  2272. + info.instance_count = 0;
  2273. +
  2274. + if (!radv_before_draw(cmd_buffer, &info, 1))
  2275. + return;
  2276. +
  2277. + uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
  2278. + uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
  2279. + pGeneratedCommandsInfo->preprocessOffset;
  2280. + const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
  2281. +
  2282. + if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
  2283. + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
  2284. + radeon_emit(cmd_buffer->cs, 0);
  2285. + }
  2286. + if (!view_mask) {
  2287. + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
  2288. + radeon_emit(cmd_buffer->cs, va);
  2289. + radeon_emit(cmd_buffer->cs, va >> 32);
  2290. + radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
  2291. + } else {
  2292. + u_foreach_bit(view, view_mask)
  2293. + {
  2294. + radv_emit_view_index(cmd_buffer, view);
  2295. +
  2296. + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
  2297. + radeon_emit(cmd_buffer->cs, va);
  2298. + radeon_emit(cmd_buffer->cs, va >> 32);
  2299. + radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
  2300. + }
  2301. + }
  2302. +
  2303. + if (layout->binds_index_buffer) {
  2304. + cmd_buffer->state.last_index_type = -1;
  2305. + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
  2306. + }
  2307. +
  2308. + if (layout->bind_vbo_mask)
  2309. + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
  2310. +
  2311. + cmd_buffer->push_constant_stages |= ~0;
  2312. +
  2313. + cmd_buffer->state.last_primitive_reset_en = -1;
  2314. + cmd_buffer->state.last_index_type = -1;
  2315. + cmd_buffer->state.last_num_instances = -1;
  2316. + cmd_buffer->state.last_vertex_offset = -1;
  2317. + cmd_buffer->state.last_first_instance = -1;
  2318. + cmd_buffer->state.last_drawid = -1;
  2319. +
  2320. + radv_after_draw(cmd_buffer);
  2321. + return;
  2322. +}
  2323. +
  2324. struct radv_dispatch_info {
  2325. /**
  2326. * Determine the layout of the grid (in block units) to be used.
  2327. --
  2328. 2.36.1
  2329.  
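On the application side, the path implemented above is reached through vkCmdExecuteGeneratedCommandsNV. A minimal sketch follows, reusing dgc_layout, pipeline and reqs from the earlier sketches; cmd, token_stream_buffer, preprocess_buffer and sequence_count are placeholders.

VkIndirectCommandsStreamNV stream = {
   .buffer = token_stream_buffer,
   .offset = 0,
};

VkGeneratedCommandsInfoNV generated_info = {
   .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_NV,
   .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
   .pipeline = pipeline,
   .indirectCommandsLayout = dgc_layout,
   .streamCount = 1,
   .pStreams = &stream,
   .sequencesCount = sequence_count,
   .preprocessBuffer = preprocess_buffer,
   .preprocessOffset = 0,
   .preprocessSize = reqs.memoryRequirements.size,
   /* Optional GPU-side count; when set, the prepare shader reads it (the patch
    * passes sequence_count = UINT32_MAX in that case). */
   .sequencesCountBuffer = VK_NULL_HANDLE,
};

vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
/* isPreprocessed = VK_FALSE: RADV's vkCmdPreprocessGeneratedCommandsNV is a no-op,
 * the generation dispatch runs right before the draw. */
vkCmdExecuteGeneratedCommandsNV(cmd, VK_FALSE, &generated_info);
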
  2330. From 3fd6f0403afebde6cf27c7f3fadbfa795853890f Mon Sep 17 00:00:00 2001
  2331. From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
  2332. Date: Fri, 7 Jan 2022 12:02:11 +0100
  2333. Subject: [PATCH 12/12] radv: Expose VK_NV_device_generated_commands.
  2334.  
  2335. ---
  2336. src/amd/vulkan/radv_device.c | 25 +++++++++++++++++++++++++
  2337. 1 file changed, 25 insertions(+)
  2338.  
  2339. diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
  2340. index 378d82c5765..e9e81a8ee0a 100644
  2341. --- a/src/amd/vulkan/radv_device.c
  2342. +++ b/src/amd/vulkan/radv_device.c
  2343. @@ -576,6 +576,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
  2344. .GOOGLE_user_type = true,
  2345. .INTEL_shader_integer_functions2 = true,
  2346. .NV_compute_shader_derivatives = true,
  2347. + .NV_device_generated_commands = true,
  2348. .NV_mesh_shader = device->use_ngg && device->rad_info.gfx_level >= GFX10_3 &&
  2349. device->instance->perftest_flags & RADV_PERFTEST_NV_MS && !device->use_llvm,
  2350. /* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from
  2351. @@ -1792,6 +1793,12 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
  2352. features->borderColorSwizzleFromImage = true;
  2353. break;
  2354. }
  2355. + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV: {
  2356. + VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV *features =
  2357. + (VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV *)ext;
  2358. + features->deviceGeneratedCommands = true;
  2359. + break;
  2360. + }
  2361. default:
  2362. break;
  2363. }
  2364. @@ -2476,6 +2483,24 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
  2365.  
  2366. break;
  2367. }
  2368. + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_PROPERTIES_NV: {
  2369. + VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *properties =
  2370. + (VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *)ext;
  2371. + properties->maxIndirectCommandsStreamCount = 1;
  2372. + properties->maxIndirectCommandsStreamStride = UINT32_MAX;
  2373. + properties->maxIndirectCommandsTokenCount = UINT32_MAX;
  2374. + properties->maxIndirectCommandsTokenOffset = UINT16_MAX;
  2375. + properties->minIndirectCommandsBufferOffsetAlignment = 4;
  2376. + properties->minSequencesCountBufferOffsetAlignment = 4;
  2377. + properties->minSequencesIndexBufferOffsetAlignment = 4;
  2378. +
  2379. + /* Don't support even a shader group count of 1 until we support shader
  2380. + * overrides during pipeline creation. */
  2381. + properties->maxGraphicsShaderGroupCount = 0;
  2382. +
  2383. + properties->maxIndirectSequenceCount = UINT32_MAX;
  2384. + break;
  2385. + }
  2386. default:
  2387. break;
  2388. }
  2389. --
  2390. 2.36.1
  2391.  
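With the extension exposed, an application checks for it in the usual way before using any of the entry points above. A short sketch, using only core Vulkan feature chaining (physical_device is a placeholder handle):

VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV dgc_features = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV,
};
VkPhysicalDeviceFeatures2 features2 = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
   .pNext = &dgc_features,
};
vkGetPhysicalDeviceFeatures2(physical_device, &features2);

if (dgc_features.deviceGeneratedCommands) {
   /* Enable VK_NV_device_generated_commands at device creation and chain the
    * same features struct into VkDeviceCreateInfo::pNext to turn it on. */
}
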
  2392. From 2a22b602e05ec29b1ff5e8f9376e84df18794c52 Mon Sep 17 00:00:00 2001
  2393. From: Hans-Kristian Arntzen <post@arntzen-software.no>
  2394. Date: Fri, 10 Jun 2022 15:59:07 +0200
  2395. Subject: [PATCH] radv: Flush SMEM/VMEM for indirects.
  2396.  
  2397. Needed since DGC shaders use shader reads.
  2398.  
  2399. Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
  2400. ---
  2401. src/amd/vulkan/radv_cmd_buffer.c | 5 ++---
  2402. 1 file changed, 2 insertions(+), 3 deletions(-)
  2403.  
  2404. diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
  2405. index be077eac52e..99e75de162f 100644
  2406. --- a/src/amd/vulkan/radv_cmd_buffer.c
  2407. +++ b/src/amd/vulkan/radv_cmd_buffer.c
  2408. @@ -4116,7 +4116,8 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_fla
  2409. case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
  2410. /* SCACHE potentially for reading the dispatch size from the shader. The
  2411. * rest is for the DGC shader input. */
  2412. - flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
  2413. + flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
  2414. + flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
  2415. if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
  2416. flush_bits |= RADV_CMD_FLAG_INV_L2;
  2417. break;
  2418. --
  2419. GitLab
  2420.  
  2421.  
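The flush above is keyed off the destination access mask the application declares. A minimal barrier sketch for the common case of writing the token stream from a compute shader and then executing it, in synchronization2 style (cmd is a placeholder; the dstAccessMask is what lands in the VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT case patched here):

VkMemoryBarrier2 barrier = {
   .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
   .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
   .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
   .dstStageMask = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT,
   .dstAccessMask = VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT,
};
VkDependencyInfo dep = {
   .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
   .memoryBarrierCount = 1,
   .pMemoryBarriers = &barrier,
};
vkCmdPipelineBarrier2(cmd, &dep);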