Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From 20f59c593fbaf10192246ccc1f396d3a8af23c20 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Tue, 21 Dec 2021 23:29:58 +0100
- Subject: [PATCH 01/12] radv: Add a 32bit memory type.
- We have to put the command buffers & upload buffers there. With DGC
- those can be allocated by the application.
- Excluding it from all other buffers/images to avoid using the
- precious 32bit address space.
- ---
- src/amd/vulkan/radv_android.c | 2 +-
- src/amd/vulkan/radv_device.c | 24 ++++++++++++++++++------
- src/amd/vulkan/radv_private.h | 2 ++
- 3 files changed, 21 insertions(+), 7 deletions(-)
- diff --git a/src/amd/vulkan/radv_android.c b/src/amd/vulkan/radv_android.c
- index 03bc702f1e5..5dda8fa4e70 100644
- --- a/src/amd/vulkan/radv_android.c
- +++ b/src/amd/vulkan/radv_android.c
- @@ -146,7 +146,7 @@ radv_image_from_gralloc(VkDevice device_h, const VkImageCreateInfo *base_info,
- for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
- bool is_local = !!(device->physical_device->memory_properties.memoryTypes[i].propertyFlags &
- VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
- - if (is_local) {
- + if (is_local && (device->physical_device->memory_types_default & (1u << i))) {
- memory_type_index = i;
- break;
- }
- diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
- index e3df1f7d27b..86a8fa9eb35 100644
- --- a/src/amd/vulkan/radv_device.c
- +++ b/src/amd/vulkan/radv_device.c
- @@ -226,6 +226,13 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
- .heapIndex = vram_index >= 0 ? vram_index : visible_vram_index,
- };
- +
- + device->memory_domains[type_count] = RADEON_DOMAIN_VRAM;
- + device->memory_flags[type_count] = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_32BIT;
- + device->memory_properties.memoryTypes[type_count++] = (VkMemoryType){
- + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
- + .heapIndex = vram_index >= 0 ? vram_index : visible_vram_index,
- + };
- }
- if (gart_index >= 0) {
- @@ -263,9 +270,9 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
- for (int i = 0; i < device->memory_properties.memoryTypeCount; i++) {
- VkMemoryType mem_type = device->memory_properties.memoryTypes[i];
- - if ((mem_type.propertyFlags &
- + if (((mem_type.propertyFlags &
- (VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) ||
- - mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
- + mem_type.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && !(device->memory_flags[i] & RADEON_FLAG_32BIT)) {
- VkMemoryPropertyFlags property_flags = mem_type.propertyFlags |
- VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD |
- @@ -281,6 +288,13 @@ radv_physical_device_init_mem_types(struct radv_physical_device *device)
- }
- device->memory_properties.memoryTypeCount = type_count;
- }
- +
- + for (unsigned i = 0; i < type_count; ++i) {
- + if (device->memory_flags[i] & RADEON_FLAG_32BIT)
- + device->memory_types_32bit |= 1u << i;
- + else
- + device->memory_types_default |= 1u << i;
- + }
- }
- static const char *
- @@ -5252,8 +5266,7 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz
- VkBufferCreateFlags flags, VkBufferCreateFlags usage,
- VkMemoryRequirements2 *pMemoryRequirements)
- {
- - pMemoryRequirements->memoryRequirements.memoryTypeBits =
- - (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
- + pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
- if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
- pMemoryRequirements->memoryRequirements.alignment = 4096;
- @@ -5315,8 +5328,7 @@ radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequiremen
- RADV_FROM_HANDLE(radv_device, device, _device);
- RADV_FROM_HANDLE(radv_image, image, pInfo->image);
- - pMemoryRequirements->memoryRequirements.memoryTypeBits =
- - (1u << device->physical_device->memory_properties.memoryTypeCount) - 1;
- + pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
- pMemoryRequirements->memoryRequirements.size = image->size;
- pMemoryRequirements->memoryRequirements.alignment = image->alignment;
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index 42ba7ba0eaa..d2aa39dfe30 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -315,6 +315,8 @@ struct radv_physical_device {
- enum radeon_bo_domain memory_domains[VK_MAX_MEMORY_TYPES];
- enum radeon_bo_flag memory_flags[VK_MAX_MEMORY_TYPES];
- unsigned heaps;
- + uint32_t memory_types_default;
- + uint32_t memory_types_32bit;
- #ifndef _WIN32
- int available_nodes;
- --
- 2.36.1
- From b797679fd8399a566bb625d59254a4d40a9cc795 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Sun, 30 Jan 2022 01:54:12 +0100
- Subject: [PATCH 02/12] Skip setting empty index buffers to avoid hang
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 12 +++++++-----
- 1 file changed, 7 insertions(+), 5 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index 064da750784..a9f2970172f 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -2672,12 +2672,14 @@ radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
- if (!indirect)
- return;
- - radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
- - radeon_emit(cs, state->index_va);
- - radeon_emit(cs, state->index_va >> 32);
- + if (state->max_index_count) {
- + radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
- + radeon_emit(cs, state->index_va);
- + radeon_emit(cs, state->index_va >> 32);
- - radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
- - radeon_emit(cs, state->max_index_count);
- + radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
- + radeon_emit(cs, state->max_index_count);
- + }
- cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
- }
- --
- 2.36.1
- From 7aad4d28badd3a9ad8050876d4e0f4f0f60258f0 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Sat, 1 Jan 2022 23:38:38 +0100
- Subject: [PATCH 03/12] radv: Expose function to write vertex descriptors for
- dgc.
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 251 ++++++++++++++++---------------
- src/amd/vulkan/radv_private.h | 2 +
- 2 files changed, 132 insertions(+), 121 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index a9f2970172f..30abd5c86be 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -3405,153 +3405,162 @@ static const uint32_t data_format_dst_sel[] = {
- [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
- };
- -static void
- -radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
- +void
- +write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
- + const struct radv_graphics_pipeline *pipeline, void *vb_ptr)
- {
- - if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
- - cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
- - /* Mesh shaders don't have vertex descriptors. */
- - assert(!cmd_buffer->state.mesh_shading);
- + struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
- + enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
- + unsigned desc_index = 0;
- + uint32_t mask = pipeline->vb_desc_usage_mask;
- + uint64_t va;
- + const struct radv_vs_input_state *vs_state =
- + vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
- + assert(!vs_state || pipeline->use_per_attribute_vb_descs);
- - struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- - struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
- - enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
- - unsigned vb_offset;
- - void *vb_ptr;
- - unsigned desc_index = 0;
- - uint32_t mask = pipeline->vb_desc_usage_mask;
- - uint64_t va;
- - const struct radv_vs_input_state *vs_state =
- - vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
- + while (mask) {
- + unsigned i = u_bit_scan(&mask);
- + uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
- + uint32_t offset, rsrc_word3;
- + unsigned binding =
- + vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
- + : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
- + struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
- + unsigned num_records;
- + unsigned stride;
- +
- + if (vs_state) {
- + unsigned format = vs_state->formats[i];
- + unsigned dfmt = format & 0xf;
- + unsigned nfmt = (format >> 4) & 0x7;
- +
- + rsrc_word3 =
- + vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
- +
- + if (chip >= GFX10)
- + rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
- + else
- + rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
- + } else {
- + if (chip >= GFX10)
- + rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
- + else
- + rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- + }
- - /* allocate some descriptor state for vertex buffers */
- - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
- - return;
- + if (!buffer) {
- + if (vs_state) {
- + /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
- + * to include the format/word3 so that the alpha channel is 1 for formats without an
- + * alpha channel.
- + */
- + desc[0] = 0;
- + desc[1] = S_008F04_STRIDE(16);
- + desc[2] = 0;
- + desc[3] = rsrc_word3;
- + } else {
- + memset(desc, 0, 4 * 4);
- + }
- + continue;
- + }
- - assert(!vs_state || pipeline->use_per_attribute_vb_descs);
- + va = radv_buffer_get_va(buffer->bo);
- - while (mask) {
- - unsigned i = u_bit_scan(&mask);
- - uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
- - uint32_t offset, rsrc_word3;
- - unsigned binding =
- - vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
- - : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
- - struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
- - unsigned num_records;
- - unsigned stride;
- + offset = cmd_buffer->vertex_bindings[binding].offset;
- + va += offset + buffer->offset;
- + if (vs_state)
- + va += vs_state->offsets[i];
- - if (vs_state) {
- - unsigned format = vs_state->formats[i];
- - unsigned dfmt = format & 0xf;
- - unsigned nfmt = (format >> 4) & 0x7;
- + if (cmd_buffer->vertex_bindings[binding].size) {
- + num_records = cmd_buffer->vertex_bindings[binding].size;
- + } else {
- + num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
- + }
- - rsrc_word3 =
- - vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
- + if (pipeline->uses_dynamic_stride) {
- + stride = cmd_buffer->vertex_bindings[binding].stride;
- + } else {
- + stride = pipeline->binding_stride[binding];
- + }
- - if (chip >= GFX10)
- - rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
- - else
- - rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
- + if (pipeline->use_per_attribute_vb_descs) {
- + uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
- + : pipeline->attrib_ends[i];
- +
- + if (num_records < attrib_end) {
- + num_records = 0; /* not enough space for one vertex */
- + } else if (stride == 0) {
- + num_records = 1; /* only one vertex */
- } else {
- - if (chip >= GFX10)
- - rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
- - else
- - rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- + num_records = (num_records - attrib_end) / stride + 1;
- + /* If attrib_offset>stride, then the compiler will increase the vertex index by
- + * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
- + * only allowed with static strides.
- + */
- + num_records += pipeline->attrib_index_offset[i];
- }
- - if (!buffer) {
- + /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
- + * into bytes in that case. GFX8 always uses bytes.
- + */
- + if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
- + num_records = (num_records - 1) * stride + attrib_end;
- + } else if (!num_records) {
- + /* On GFX9, it seems bounds checking is disabled if both
- + * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
- + * GFX10.3 but it doesn't hurt.
- + */
- if (vs_state) {
- - /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
- - * to include the format/word3 so that the alpha channel is 1 for formats without an
- - * alpha channel.
- - */
- desc[0] = 0;
- desc[1] = S_008F04_STRIDE(16);
- desc[2] = 0;
- desc[3] = rsrc_word3;
- } else {
- - memset(desc, 0, 4 * 4);
- + memset(desc, 0, 16);
- }
- continue;
- }
- + } else {
- + if (chip != GFX8 && stride)
- + num_records = DIV_ROUND_UP(num_records, stride);
- + }
- - va = radv_buffer_get_va(buffer->bo);
- -
- - offset = cmd_buffer->vertex_bindings[binding].offset;
- - va += offset + buffer->offset;
- - if (vs_state)
- - va += vs_state->offsets[i];
- -
- - if (cmd_buffer->vertex_bindings[binding].size) {
- - num_records = cmd_buffer->vertex_bindings[binding].size;
- - } else {
- - num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
- - }
- -
- - if (pipeline->uses_dynamic_stride) {
- - stride = cmd_buffer->vertex_bindings[binding].stride;
- - } else {
- - stride = pipeline->binding_stride[binding];
- - }
- + if (chip >= GFX10) {
- + /* OOB_SELECT chooses the out-of-bounds check:
- + * - 1: index >= NUM_RECORDS (Structured)
- + * - 3: offset >= NUM_RECORDS (Raw)
- + */
- + int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
- + rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
- + }
- - if (pipeline->use_per_attribute_vb_descs) {
- - uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
- - : pipeline->attrib_ends[i];
- + desc[0] = va;
- + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
- + desc[2] = num_records;
- + desc[3] = rsrc_word3;
- + }
- +}
- - if (num_records < attrib_end) {
- - num_records = 0; /* not enough space for one vertex */
- - } else if (stride == 0) {
- - num_records = 1; /* only one vertex */
- - } else {
- - num_records = (num_records - attrib_end) / stride + 1;
- - /* If attrib_offset>stride, then the compiler will increase the vertex index by
- - * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
- - * only allowed with static strides.
- - */
- - num_records += pipeline->attrib_index_offset[i];
- - }
- +static void
- +radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
- +{
- + if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
- + cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
- + /* Mesh shaders don't have vertex descriptors. */
- + assert(!cmd_buffer->state.mesh_shading);
- - /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
- - * into bytes in that case. GFX8 always uses bytes.
- - */
- - if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
- - num_records = (num_records - 1) * stride + attrib_end;
- - } else if (!num_records) {
- - /* On GFX9, it seems bounds checking is disabled if both
- - * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
- - * GFX10.3 but it doesn't hurt.
- - */
- - if (vs_state) {
- - desc[0] = 0;
- - desc[1] = S_008F04_STRIDE(16);
- - desc[2] = 0;
- - desc[3] = rsrc_word3;
- - } else {
- - memset(desc, 0, 16);
- - }
- - continue;
- - }
- - } else {
- - if (chip != GFX8 && stride)
- - num_records = DIV_ROUND_UP(num_records, stride);
- - }
- + struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
- + unsigned vb_offset;
- + void *vb_ptr;
- + uint64_t va;
- - if (chip >= GFX10) {
- - /* OOB_SELECT chooses the out-of-bounds check:
- - * - 1: index >= NUM_RECORDS (Structured)
- - * - 3: offset >= NUM_RECORDS (Raw)
- - */
- - int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
- - rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
- - }
- + /* allocate some descriptor state for vertex buffers */
- + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
- + &vb_ptr))
- + return;
- - desc[0] = va;
- - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
- - desc[2] = num_records;
- - desc[3] = rsrc_word3;
- - }
- + write_vertex_descriptors(cmd_buffer, pipeline, vb_ptr);
- va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
- va += vb_offset;
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index d2aa39dfe30..8d3b1eb1afb 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -1656,6 +1656,8 @@ void radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
- const struct radv_subpass *subpass);
- bool radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size,
- const void *data, unsigned *out_offset);
- +void write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
- + const struct radv_graphics_pipeline *pipeline, void *vb_ptr);
- void radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer);
- void radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer);
- --
- 2.36.1
- From fd645676eaed6b9528ba7408d5c20aeaed4295ad Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 7 Feb 2022 02:04:56 +0100
- Subject: [PATCH 04/12] radv: Always store stride in the vbo descriptor.
- So we can use it in the DGC shader.
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 24 ++++++++----------------
- 1 file changed, 8 insertions(+), 16 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index 30abd5c86be..5a64749c625 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -3449,19 +3449,21 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
- + if (pipeline->uses_dynamic_stride) {
- + stride = cmd_buffer->vertex_bindings[binding].stride;
- + } else {
- + stride = pipeline->binding_stride[binding];
- + }
- +
- if (!buffer) {
- - if (vs_state) {
- /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
- * to include the format/word3 so that the alpha channel is 1 for formats without an
- * alpha channel.
- */
- desc[0] = 0;
- - desc[1] = S_008F04_STRIDE(16);
- + desc[1] = S_008F04_STRIDE(stride);
- desc[2] = 0;
- desc[3] = rsrc_word3;
- - } else {
- - memset(desc, 0, 4 * 4);
- - }
- continue;
- }
- @@ -3478,12 +3480,6 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
- num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
- }
- - if (pipeline->uses_dynamic_stride) {
- - stride = cmd_buffer->vertex_bindings[binding].stride;
- - } else {
- - stride = pipeline->binding_stride[binding];
- - }
- -
- if (pipeline->use_per_attribute_vb_descs) {
- uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
- : pipeline->attrib_ends[i];
- @@ -3511,14 +3507,10 @@ write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
- * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
- * GFX10.3 but it doesn't hurt.
- */
- - if (vs_state) {
- desc[0] = 0;
- - desc[1] = S_008F04_STRIDE(16);
- + desc[1] = S_008F04_STRIDE(stride);
- desc[2] = 0;
- desc[3] = rsrc_word3;
- - } else {
- - memset(desc, 0, 16);
- - }
- continue;
- }
- } else {
- --
- 2.36.1
- From 8c5047236608abf1f352a39d81d99212bf13b2eb Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 7 Feb 2022 02:08:51 +0100
- Subject: [PATCH 05/12] radv: Require 32bit memory for indirect buffers.
- ---
- src/amd/vulkan/radv_device.c | 3 +++
- 1 file changed, 3 insertions(+)
- diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
- index 86a8fa9eb35..378d82c5765 100644
- --- a/src/amd/vulkan/radv_device.c
- +++ b/src/amd/vulkan/radv_device.c
- @@ -5267,6 +5267,9 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz
- VkMemoryRequirements2 *pMemoryRequirements)
- {
- pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_default;
- + if (usage & VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT)
- + pMemoryRequirements->memoryRequirements.memoryTypeBits |=
- + device->physical_device->memory_types_32bit;
- if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
- pMemoryRequirements->memoryRequirements.alignment = 4096;
- --
- 2.36.1
- From 82c2fd5183e6aeb8829905456a0080d58965f7c4 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Tue, 28 Jun 2022 00:37:02 +0200
- Subject: [PATCH 06/12] radv: Expose helper for base pa_su_sc_mode_cntl.
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 12 +++++++++---
- src/amd/vulkan/radv_private.h | 1 +
- 2 files changed, 10 insertions(+), 3 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index 5a64749c625..4569788997c 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -1558,11 +1558,10 @@ radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
- S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
- }
- -static void
- -radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
- +uint32_t radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
- {
- unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
- - struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
- + const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
- pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
- C_028814_CULL_BACK &
- @@ -1577,6 +1576,13 @@ radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
- S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
- S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
- S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
- + return pa_su_sc_mode_cntl;
- +}
- +
- +static void
- +radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
- +{
- + unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
- radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
- }
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index 8d3b1eb1afb..9a6790090dc 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -1643,6 +1643,7 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin
- void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
- void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries);
- +uint32_t radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer);
- unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs);
- uint32_t radv_hash_vs_prolog(const void *key_);
- --
- 2.36.1
- From 66539189d78eff85ba6a339f92fc6b8c4c976997 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Tue, 28 Jun 2022 00:14:49 +0200
- Subject: [PATCH 07/12] radv: Add flushing for DGC.
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 8 +++++---
- 1 file changed, 5 insertions(+), 3 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index 4569788997c..eb68eaa53d1 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -4109,9 +4109,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_fla
- {
- switch ((VkAccessFlags2)(1 << b)) {
- case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
- - /* SMEM loads are used to read compute dispatch size in shaders */
- - if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
- - flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
- + /* SCACHE potentially for reading the dispatch size from the shader. The
- + * rest is for the DGC shader input. */
- + flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
- + if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
- + flush_bits |= RADV_CMD_FLAG_INV_L2;
- break;
- case VK_ACCESS_2_INDEX_READ_BIT:
- case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
- --
- 2.36.1
- From e561ef7c88923b0d5671666cdcb85a15f13f90fa Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 7 Feb 2022 03:28:01 +0100
- Subject: [PATCH 08/12] radv: Add DGC meta shader.
- This generates the cmd and upload buffers.
- ---
- src/amd/vulkan/meson.build | 1 +
- .../vulkan/radv_device_generated_commands.c | 892 ++++++++++++++++++
- src/amd/vulkan/radv_meta.c | 7 +
- src/amd/vulkan/radv_meta.h | 3 +
- src/amd/vulkan/radv_private.h | 6 +
- 5 files changed, 909 insertions(+)
- create mode 100644 src/amd/vulkan/radv_device_generated_commands.c
- diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
- index 75f0685a77a..ebe7cb087b4 100644
- --- a/src/amd/vulkan/meson.build
- +++ b/src/amd/vulkan/meson.build
- @@ -49,6 +49,7 @@ libradv_files = files(
- 'radv_device.c',
- 'radv_descriptor_set.c',
- 'radv_descriptor_set.h',
- + 'radv_device_generated_commands.c',
- 'radv_formats.c',
- 'radv_image.c',
- 'radv_meta.c',
- diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
- new file mode 100644
- index 00000000000..68d8e4d5060
- --- /dev/null
- +++ b/src/amd/vulkan/radv_device_generated_commands.c
- @@ -0,0 +1,892 @@
- +/*
- + * Copyright © 2021 Google
- + *
- + * Permission is hereby granted, free of charge, to any person obtaining a
- + * copy of this software and associated documentation files (the "Software"),
- + * to deal in the Software without restriction, including without limitation
- + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- + * and/or sell copies of the Software, and to permit persons to whom the
- + * Software is furnished to do so, subject to the following conditions:
- + *
- + * The above copyright notice and this permission notice (including the next
- + * paragraph) shall be included in all copies or substantial portions of the
- + * Software.
- + *
- + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- + * IN THE SOFTWARE.
- + */
- +
- +#include "radv_meta.h"
- +#include "radv_private.h"
- +
- +#include "nir_builder.h"
- +
- +enum radv_dgc_token_type {
- + RADV_DGC_INDEX_BUFFER,
- + RADV_DGC_DRAW,
- + RADV_DGC_INDEXED_DRAW,
- +};
- +
- +struct radv_dgc_token {
- + uint16_t type; /* enum radv_dgc_token_type, but making the size explicit */
- + uint16_t offset; /* offset in the input stream */
- + union {
- + struct {
- + uint16_t vtx_base_sgpr;
- + } draw;
- + struct {
- + uint16_t index_size;
- + uint16_t vtx_base_sgpr;
- + uint32_t max_index_count;
- + } indexed_draw;
- + };
- +};
- +
- +struct radv_dgc_params {
- + uint32_t cmd_buf_stride;
- + uint32_t cmd_buf_size;
- + uint32_t upload_stride;
- + uint32_t upload_addr;
- + uint32_t sequence_count;
- + uint32_t stream_stride;
- +
- + /* draw info */
- + uint16_t draw_indexed;
- + uint16_t draw_params_offset;
- + uint16_t base_index_size;
- + uint16_t vtx_base_sgpr;
- + uint32_t max_index_count;
- +
- + /* bind index buffer info. Valid if base_index_size == 0 && draw_indexed */
- + uint16_t index_buffer_offset;
- +
- + uint8_t vbo_cnt;
- + uint8_t const_copy;
- +
- + /* Which VBOs are set in this indirect layout. */
- + uint32_t vbo_bind_mask;
- +
- + uint16_t vbo_reg;
- + uint16_t const_copy_size;
- +
- + uint64_t push_constant_mask;
- +
- + uint32_t ibo_type_32;
- + uint32_t ibo_type_8;
- +
- + uint16_t push_constant_shader_cnt;
- +
- + uint16_t emit_state;
- + uint32_t pa_su_sc_mode_cntl_base;
- + uint16_t state_offset;
- + uint16_t scissor_count;
- + uint16_t scissor_offset; /* in parameter buffer. */
- +};
- +
- +#define load_param32(b, field) \
- + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
- + .base = offsetof(struct radv_dgc_params, field), .range = 4)
- +
- +#define load_param16(b, field) \
- + nir_ubfe( \
- + (b), \
- + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
- + .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
- + nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 2) * 8), nir_imm_int((b), 16))
- +
- +#define load_param8(b, field) \
- + nir_ubfe( \
- + (b), \
- + nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
- + .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
- + nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 3) * 8), nir_imm_int((b), 8))
- +
- +#define load_param64(b, field) \
- + nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \
- + .base = offsetof(struct radv_dgc_params, field), .range = 8))
- +
- +static nir_ssa_def *
- +nir_pkt3(nir_builder *b, unsigned op, nir_ssa_def *len)
- +{
- + len = nir_iand_imm(b, len, 0x3fff);
- + return nir_ior(b, nir_imm_int(b, PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op)),
- + nir_ishl(b, len, nir_imm_int(b, 16)));
- +}
- +
- +static nir_ssa_def *
- +dgc_emit_userdata_vertex(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *vtx_base_sgpr,
- + nir_ssa_def *first_vertex, nir_ssa_def *first_instance, nir_ssa_def *drawid)
- +{
- + vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
- + nir_ssa_def *has_drawid =
- + nir_ine(b, nir_iand_imm(b, vtx_base_sgpr, 1u << 14), nir_imm_int(b, 0));
- + nir_ssa_def *has_baseinstance =
- + nir_ine(b, nir_iand_imm(b, vtx_base_sgpr, 1u << 15), nir_imm_int(b, 0));
- +
- + nir_ssa_def *pkt_cnt = nir_imm_int(b, 1);
- + pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
- + pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
- +
- + nir_ssa_def *values[5] = {
- + nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex,
- + nir_imm_int(b, PKT3_NOP_PAD), nir_imm_int(b, PKT3_NOP_PAD),
- + };
- +
- + values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance),
- + nir_bcsel(b, has_drawid, drawid, first_instance), values[4]);
- + values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]);
- +
- + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
- +
- + nir_store_ssbo(b, nir_vec(b, values, 4), dst_buf, offset, .write_mask = 0xf,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_ssbo(b, nir_vec(b, values + 4, 1), dst_buf, nir_iadd_imm(b, offset, 16),
- + .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + return nir_iadd_imm(b, offset, 20);
- +}
- +
- +static nir_ssa_def *
- +dgc_emit_instance_count(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *instance_count)
- +{
- + nir_ssa_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count};
- +
- + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
- +
- + nir_store_ssbo(b, nir_vec(b, values, 2), dst_buf, offset, .write_mask = 0x3,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + return nir_iadd_imm(b, offset, 8);
- +}
- +
- +static nir_ssa_def *
- +dgc_emit_draw_indexed(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index_offset,
- + nir_ssa_def *index_count, nir_ssa_def *max_index_count)
- +{
- + nir_ssa_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)),
- + max_index_count, index_offset, index_count,
- + nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)};
- +
- + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
- +
- + nir_store_ssbo(b, nir_vec(b, values, 4), dst_buf, offset, .write_mask = 0xf,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_ssbo(b, nir_vec(b, values + 4, 1), dst_buf, nir_iadd_imm(b, offset, 16),
- + .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + return nir_iadd_imm(b, offset, 20);
- +}
- +
- +static nir_ssa_def *
- +dgc_emit_draw(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *vertex_count)
- +{
- + nir_ssa_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count,
- + nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)};
- +
- + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
- +
- + nir_store_ssbo(b, nir_vec(b, values, 3), dst_buf, offset, .write_mask = 0x7,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + return nir_iadd_imm(b, offset, 12);
- +}
- +
- +static void
- +build_dgc_buffer_tail(nir_builder *b, nir_ssa_def *sequence_count)
- +{
- + nir_ssa_def *global_id = get_global_ids(b, 1);
- +
- + nir_ssa_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
- + nir_ssa_def *cmd_buf_size = load_param32(b, cmd_buf_size);
- +
- + nir_push_if(b, nir_ieq(b, global_id, nir_imm_int(b, 0)));
- + {
- + nir_ssa_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
- +
- + nir_variable *offset =
- + nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
- + nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
- +
- + nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, 2);
- + nir_push_loop(b);
- + {
- + nir_ssa_def *curr_offset = nir_load_var(b, offset);
- +
- + nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
- + {
- + nir_jump(b, nir_jump_break);
- + }
- + nir_pop_if(b, NULL);
- +
- + nir_ssa_def *packet_size = nir_isub(b, cmd_buf_size, curr_offset);
- + packet_size = nir_umin(b, packet_size, nir_imm_int(b, 0x3ffc * 4));
- +
- + nir_ssa_def *len = nir_ushr_imm(b, packet_size, 2);
- + len = nir_iadd_imm(b, len, -2);
- + nir_ssa_def *packet = nir_pkt3(b, PKT3_NOP, len);
- +
- + nir_store_ssbo(b, packet, dst_buf, curr_offset, .write_mask = 0x1,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
- + }
- + nir_pop_loop(b, NULL);
- + }
- + nir_pop_if(b, NULL);
- +}
- +
- +static nir_shader *
- +build_dgc_prepare_shader(struct radv_device *dev)
- +{
- + nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
- + b.shader->info.workgroup_size[0] = 64;
- +
- + nir_ssa_def *global_id = get_global_ids(&b, 1);
- +
- + nir_ssa_def *sequence_id = global_id;
- +
- + nir_ssa_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
- + nir_ssa_def *sequence_count = load_param32(&b, sequence_count);
- + nir_ssa_def *stream_stride = load_param32(&b, stream_stride);
- +
- + nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
- + nir_store_var(&b, count_var, sequence_count, 0x1);
- +
- + nir_push_if(&b, nir_ieq_imm(&b, sequence_count, UINT32_MAX));
- + {
- + nir_ssa_def *count_buf = radv_meta_load_descriptor(&b, 0, 4);
- + nir_ssa_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0), .align_mul = 4);
- + nir_store_var(&b, count_var, cnt, 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + sequence_count = nir_load_var(&b, count_var);
- +
- + nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
- + {
- + nir_variable *cmd_buf_offset =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset");
- + nir_store_var(&b, cmd_buf_offset, nir_imul(&b, global_id, cmd_buf_stride), 1);
- + nir_ssa_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf_offset), cmd_buf_stride);
- +
- + nir_ssa_def *stream_buf = radv_meta_load_descriptor(&b, 0, 1);
- + nir_ssa_def *stream_base = nir_imul(&b, sequence_id, stream_stride);
- +
- + nir_variable *upload_offset =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset");
- + nir_store_var(&b, upload_offset,
- + nir_iadd(&b, load_param32(&b, cmd_buf_size),
- + nir_imul(&b, load_param32(&b, upload_stride), sequence_id)),
- + 0x1);
- +
- + nir_ssa_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask);
- + nir_ssa_def *vbo_cnt = load_param8(&b, vbo_cnt);
- + nir_push_if(&b, nir_ine(&b, vbo_bind_mask, nir_imm_int(&b, 0)));
- + {
- + nir_variable *vbo_idx =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
- + nir_store_var(&b, vbo_idx, nir_imm_int(&b, 0), 0x1);
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- +
- + nir_push_loop(&b);
- + {
- + nir_push_if(&b, nir_uge(&b, nir_load_var(&b, vbo_idx), vbo_cnt));
- + {
- + nir_jump(&b, nir_jump_break);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *vbo_offset = nir_imul(&b, nir_load_var(&b, vbo_idx), nir_imm_int(&b, 16));
- + nir_variable *vbo_data =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
- +
- + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
- + nir_store_var(&b, vbo_data,
- + nir_load_ssbo(&b, 4, 32, param_buf, vbo_offset, .align_mul = 4), 0xf);
- +
- + nir_ssa_def *vbo_override =
- + nir_ine(&b,
- + nir_iand(&b, vbo_bind_mask,
- + nir_ishl(&b, nir_imm_int(&b, 1), nir_load_var(&b, vbo_idx))),
- + nir_imm_int(&b, 0));
- + nir_push_if(&b, vbo_override);
- + {
- + nir_ssa_def *vbo_offset_offset =
- + nir_iadd(&b, nir_imul(&b, vbo_cnt, nir_imm_int(&b, 16)),
- + nir_imul(&b, nir_load_var(&b, vbo_idx), nir_imm_int(&b, 8)));
- + nir_ssa_def *vbo_over_data =
- + nir_load_ssbo(&b, 2, 32, param_buf, vbo_offset_offset, .align_mul = 4);
- + nir_ssa_def *stream_offset = nir_iadd(
- + &b, stream_base, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 0x7FFF));
- + nir_ssa_def *stream_data =
- + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
- +
- + nir_ssa_def *va = nir_pack_64_2x32(&b, nir_channels(&b, stream_data, 0x3));
- + nir_ssa_def *size = nir_channel(&b, stream_data, 2);
- + nir_ssa_def *stride = nir_channel(&b, stream_data, 3);
- +
- + nir_ssa_def *vs_state_offset = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 15));
- + va = nir_iadd(&b, va, nir_u2u64(&b, vs_state_offset));
- +
- + nir_ssa_def *dyn_stride =
- + nir_ine(&b, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 1u << 15),
- + nir_imm_int(&b, 0));
- + nir_ssa_def *old_stride =
- + nir_ubfe(&b, nir_channel(&b, nir_load_var(&b, vbo_data), 1), nir_imm_int(&b, 16),
- + nir_imm_int(&b, 14));
- + stride = nir_bcsel(&b, dyn_stride, stride, old_stride);
- +
- + nir_ssa_def *use_per_attribute_vb_descs =
- + nir_ine(&b, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 1u << 31),
- + nir_imm_int(&b, 0));
- + nir_variable *num_records = nir_variable_create(b.shader, nir_var_shader_temp,
- + glsl_uint_type(), "num_records");
- + nir_store_var(&b, num_records, size, 0x1);
- +
- + nir_push_if(&b, use_per_attribute_vb_descs);
- + {
- + nir_ssa_def *attrib_end = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1),
- + nir_imm_int(&b, 16), nir_imm_int(&b, 16));
- + nir_ssa_def *attrib_index_offset =
- + nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1), nir_imm_int(&b, 0),
- + nir_imm_int(&b, 16));
- +
- + nir_push_if(&b, nir_ult(&b, nir_load_var(&b, num_records), attrib_end));
- + {
- + nir_store_var(&b, num_records, nir_imm_int(&b, 0), 0x1);
- + }
- + nir_push_else(&b, NULL);
- + nir_push_if(&b, nir_ieq_imm(&b, stride, 0));
- + {
- + nir_store_var(&b, num_records, nir_imm_int(&b, 1), 0x1);
- + }
- + nir_push_else(&b, NULL);
- + {
- + nir_ssa_def *r = nir_iadd(
- + &b,
- + nir_iadd_imm(
- + &b,
- + nir_udiv(&b, nir_isub(&b, nir_load_var(&b, num_records), attrib_end),
- + stride),
- + 1),
- + attrib_index_offset);
- + nir_store_var(&b, num_records, r, 0x1);
- + }
- + nir_pop_if(&b, NULL);
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *convert_cond =
- + nir_ine(&b, nir_load_var(&b, num_records), nir_imm_int(&b, 0));
- + if (dev->physical_device->rad_info.gfx_level == GFX9)
- + convert_cond = nir_imm_bool(&b, false);
- + else if (dev->physical_device->rad_info.gfx_level != GFX8)
- + convert_cond =
- + nir_iand(&b, convert_cond, nir_ieq_imm(&b, stride, 0));
- +
- + nir_ssa_def *new_records = nir_iadd(
- + &b, nir_imul(&b, nir_iadd_imm(&b, nir_load_var(&b, num_records), -1), stride),
- + attrib_end);
- + new_records =
- + nir_bcsel(&b, convert_cond, new_records, nir_load_var(&b, num_records));
- + nir_store_var(&b, num_records, new_records, 0x1);
- + }
- + nir_push_else(&b, NULL);
- + {
- + if (dev->physical_device->rad_info.gfx_level != GFX8) {
- + nir_push_if(&b, nir_ine(&b, stride, nir_imm_int(&b, 0)));
- + {
- + nir_ssa_def *r = nir_iadd(&b, nir_load_var(&b, num_records),
- + nir_iadd_imm(&b, stride, -1));
- + nir_store_var(&b, num_records, nir_udiv(&b, r, stride), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- + }
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *rsrc_word3 = nir_channel(&b, nir_load_var(&b, vbo_data), 3);
- + if (dev->physical_device->rad_info.gfx_level >= GFX10) {
- + nir_ssa_def *oob_select = nir_bcsel(
- + &b, nir_ieq_imm(&b, stride, 0), nir_imm_int(&b, V_008F0C_OOB_SELECT_RAW),
- + nir_imm_int(&b, V_008F0C_OOB_SELECT_STRUCTURED));
- + rsrc_word3 = nir_iand_imm(&b, rsrc_word3, C_008F0C_OOB_SELECT);
- + rsrc_word3 =
- + nir_ior(&b, rsrc_word3, nir_ishl(&b, oob_select, nir_imm_int(&b, 28)));
- + }
- +
- + nir_ssa_def *va_hi = nir_iand_imm(&b, nir_unpack_64_2x32_split_y(&b, va), 0xFFFF);
- + stride = nir_iand_imm(&b, stride, 0x3FFF);
- + nir_ssa_def *new_vbo_data[4] = {
- + nir_unpack_64_2x32_split_x(&b, va),
- + nir_ior(&b, nir_ishl(&b, stride, nir_imm_int(&b, 16)), va_hi),
- + nir_load_var(&b, num_records), rsrc_word3};
- + nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *upload_off = nir_iadd(&b, nir_load_var(&b, upload_offset), vbo_offset);
- + nir_store_ssbo(&b, nir_load_var(&b, vbo_data), cmd_buf, upload_off, .write_mask = 0xf,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, vbo_idx, nir_iadd_imm(&b, nir_load_var(&b, vbo_idx), 1), 0x1);
- + }
- + nir_pop_loop(&b, NULL);
- + nir_ssa_def *packet[3] = {
- + nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(&b, vbo_reg),
- + nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))};
- +
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + nir_store_ssbo(&b, nir_vec(&b, packet, 3), cmd_buf, off, .write_mask = 0x7,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 12), 0x1);
- +
- + nir_store_var(&b, upload_offset, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_imul(&b, vbo_cnt, nir_imm_int(&b, 16))), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- +
- + nir_ssa_def *push_const_mask = load_param64(&b, push_constant_mask);
- + nir_push_if(&b, nir_ine(&b, push_const_mask, nir_imm_int64(&b, 0)));
- + {
- + nir_ssa_def *const_copy = nir_ine(&b, load_param8(&b, const_copy), nir_imm_int(&b, 0));
- + nir_ssa_def *const_copy_size = load_param16(&b, const_copy_size);
- + nir_ssa_def *const_copy_words = nir_ushr_imm(&b, const_copy_size, 2);
- + const_copy_words = nir_bcsel(&b, const_copy, const_copy_words, nir_imm_int(&b, 0));
- +
- + nir_variable *idx =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx");
- + nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);
- +
- + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
- + nir_ssa_def *param_offset = nir_imul(&b, vbo_cnt, nir_imm_int(&b, 24));
- + nir_ssa_def *param_offset_offset = nir_iadd_imm(&b, param_offset, MESA_VULKAN_SHADER_STAGES * 12);
- + nir_ssa_def *param_const_offset = nir_iadd_imm(&b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12);
- + nir_push_loop(&b);
- + {
- + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
- + nir_push_if(&b, nir_uge(&b, cur_idx, const_copy_words));
- + {
- + nir_jump(&b, nir_jump_break);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
- +
- + nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
- + update = nir_bcsel(&b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64)), update, nir_imm_int64(&b, 0));
- +
- + nir_push_if(&b, nir_ine(&b, update, nir_imm_int64(&b, 0)));
- + {
- + nir_ssa_def *stream_offset = nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4);
- + nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4);
- + nir_store_var(&b, data, new_data, 0x1);
- + }
- + nir_push_else(&b, NULL);
- + {
- + nir_store_var(&b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_store_ssbo(&b, nir_load_var(&b, data), cmd_buf, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
- +
- + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
- + }
- + nir_pop_loop(&b, NULL);
- +
- + nir_variable *shader_idx =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx");
- + nir_store_var(&b, shader_idx, nir_imm_int(&b, 0), 0x1);
- + nir_ssa_def *shader_cnt = load_param16(&b, push_constant_shader_cnt);
- +
- + nir_push_loop(&b);
- + {
- + nir_ssa_def *cur_shader_idx = nir_load_var(&b, shader_idx);
- + nir_push_if(&b, nir_uge(&b, cur_shader_idx, shader_cnt));
- + {
- + nir_jump(&b, nir_jump_break);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *reg_info = nir_load_ssbo(&b, 3, 32, param_buf, nir_iadd(&b, param_offset, nir_imul_imm(&b, cur_shader_idx, 12)), .align_mul = 4);
- + nir_ssa_def *upload_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 0), nir_imm_int(&b, 16));
- + nir_ssa_def *inline_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 16));
- + nir_ssa_def *inline_mask = nir_pack_64_2x32(&b, nir_channels(&b, reg_info, 0x6));
- +
- + nir_push_if(&b, nir_ine(&b, upload_sgpr, nir_imm_int(&b, 0)));
- + {
- + nir_ssa_def *pkt[3] = {
- + nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)),
- + upload_sgpr,
- + nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))
- + };
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + nir_store_ssbo(&b, nir_vec(&b, pkt, 3), cmd_buf, off, .write_mask = 0x7, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 12), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_push_if(&b, nir_ine(&b, inline_sgpr, nir_imm_int(&b, 0)));
- + {
- + nir_ssa_def *inline_len = nir_bit_count(&b, inline_mask);
- + nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);
- +
- + nir_ssa_def *pkt[2] = {
- + nir_pkt3(&b, PKT3_SET_SH_REG, inline_len),
- + inline_sgpr
- + };
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + nir_store_ssbo(&b, nir_vec(&b, pkt, 2), cmd_buf, off, .write_mask = 0x3, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 8), 0x1);
- +
- + nir_push_loop(&b);
- + {
- + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
- + nir_push_if(&b, nir_uge(&b, cur_idx, nir_imm_int(&b, 64)));
- + {
- + nir_jump(&b, nir_jump_break);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *l = nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx);
- + nir_push_if(&b,nir_ieq(&b, nir_iand(&b, l, inline_mask), nir_imm_int64(&b, 0)));
- + {
- + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
- + nir_jump(&b, nir_jump_continue);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
- +
- + nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
- + update = nir_bcsel(&b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64)), update, nir_imm_int64(&b, 0));
- +
- + nir_push_if(&b, nir_ine(&b, update, nir_imm_int64(&b, 0)));
- + {
- + nir_ssa_def *stream_offset = nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4);
- + nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4);
- + nir_store_var(&b, data, new_data, 0x1);
- + }
- + nir_push_else(&b, NULL);
- + {
- + nir_store_var(&b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl(&b, cur_idx, nir_imm_int(&b, 2))), .align_mul = 4), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + off = nir_load_var(&b, cmd_buf_offset);
- + nir_store_ssbo(&b, nir_load_var(&b, data), cmd_buf, off, .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 4), 0x1);
- +
- + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
- + }
- + nir_pop_loop(&b, NULL);
- + }
- + nir_pop_if(&b, NULL);
- + nir_store_var(&b, shader_idx, nir_iadd_imm(&b, cur_shader_idx, 1), 0x1);
- + }
- + nir_pop_loop(&b, NULL);
- + }
- + nir_pop_if(&b, 0);
- +
- + nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, emit_state), 1));
- + {
- + nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, state_offset), stream_base);
- + nir_ssa_def *state = nir_load_ssbo(&b, 1, 32, stream_buf, stream_offset, .align_mul = 4);
- + state = nir_iand_imm(&b, state, 1);
- +
- + nir_ssa_def *reg =
- + nir_ior(&b, load_param32(&b, pa_su_sc_mode_cntl_base), nir_ishl_imm(&b, state, 2));
- +
- + nir_ssa_def *cmd_values[3] = {
- + nir_imm_int(&b, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)),
- + nir_imm_int(&b, (R_028814_PA_SU_SC_MODE_CNTL - SI_CONTEXT_REG_OFFSET) >> 2), reg};
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- +
- + nir_store_ssbo(&b, nir_vec(&b, cmd_values, 3), cmd_buf, off, .write_mask = 0x7,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 0xc), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *scissor_count = load_param16(&b, scissor_count);
- + nir_push_if(&b, nir_ine(&b, scissor_count, nir_imm_int(&b, 0)));
- + {
- + nir_ssa_def *scissor_offset = load_param16(&b, scissor_offset);
- + nir_variable *idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(),
- + "scissor_copy_idx");
- + nir_store_var(&b, idx, nir_imm_int(&b, 0), 1);
- +
- + nir_push_loop(&b);
- + {
- + nir_ssa_def *cur_idx = nir_load_var(&b, idx);
- + nir_push_if(&b, nir_uge(&b, cur_idx, scissor_count));
- + {
- + nir_jump(&b, nir_jump_break);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, 3);
- + nir_ssa_def *param_offset = nir_iadd(&b, scissor_offset, nir_imul_imm(&b, cur_idx, 4));
- + nir_ssa_def *value = nir_load_ssbo(&b, 1, 32, param_buf, param_offset, .align_mul = 4);
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- +
- + nir_store_ssbo(&b, value, cmd_buf, off, .write_mask = 0x1,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 4), 0x1);
- +
- + nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 1);
- + }
- + nir_pop_loop(&b, NULL);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0));
- + {
- + nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
- + nir_ssa_def *stream_offset =
- + nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);
- +
- + nir_ssa_def *draw_data0 =
- + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
- + nir_ssa_def *vertex_count = nir_channel(&b, draw_data0, 0);
- + nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
- + nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 2);
- + nir_ssa_def *first_instance = nir_channel(&b, draw_data0, 3);
- +
- + nir_push_if(&b, nir_iand(&b, nir_ine(&b, vertex_count, nir_imm_int(&b, 0)), nir_ine(&b, instance_count, nir_imm_int(&b, 0))));
- + {
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + off = dgc_emit_userdata_vertex(&b, off, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
- + off = dgc_emit_instance_count(&b, off, instance_count);
- + off = dgc_emit_draw(&b, off, vertex_count);
- + nir_store_var(&b, cmd_buf_offset, off, 0x1);
- + }
- + nir_pop_if(&b, 0);
- + }
- + nir_push_else(&b, NULL);
- + {
- + nir_variable *index_size_var =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "index_size");
- + nir_store_var(&b, index_size_var, load_param16(&b, base_index_size), 0x1);
- + nir_variable *max_index_count_var =
- + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
- + nir_store_var(&b, max_index_count_var, load_param32(&b, max_index_count), 0x1);
- +
- + nir_ssa_def *bind_index_buffer = nir_ieq_imm(&b, nir_load_var(&b, index_size_var), 0);
- + nir_push_if(&b, bind_index_buffer);
- + {
- + nir_ssa_def *index_stream_offset =
- + nir_iadd(&b, load_param16(&b, index_buffer_offset), stream_base);
- + nir_ssa_def *data =
- + nir_load_ssbo(&b, 4, 32, stream_buf, index_stream_offset, .align_mul = 4);
- +
- + nir_ssa_def *vk_index_type = nir_channel(&b, data, 3);
- + nir_ssa_def *index_type = nir_bcsel(
- + &b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_32)),
- + nir_imm_int(&b, V_028A7C_VGT_INDEX_32), nir_imm_int(&b, V_028A7C_VGT_INDEX_16));
- + index_type = nir_bcsel(&b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_8)),
- + nir_imm_int(&b, V_028A7C_VGT_INDEX_8), index_type);
- +
- + nir_ssa_def *index_size = nir_iand_imm(
- + &b, nir_ushr(&b, nir_imm_int(&b, 0x142), nir_ishr_imm(&b, index_type, 2)), 0xf);
- + nir_store_var(&b, index_size_var, index_size, 0x1);
- +
- + nir_ssa_def *max_index_count = nir_udiv(&b, nir_channel(&b, data, 2), index_size);
- + nir_store_var(&b, max_index_count_var, max_index_count, 0x1);
- +
- + nir_ssa_def *cmd_values[3 + 2 + 3];
- +
- + if (dev->physical_device->rad_info.gfx_level >= GFX9) {
- + unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
- + if (dev->physical_device->rad_info.gfx_level < GFX9 ||
- + (dev->physical_device->rad_info.gfx_level == GFX9 &&
- + dev->physical_device->rad_info.me_fw_version < 26))
- + opcode = PKT3_SET_UCONFIG_REG;
- + cmd_values[0] = nir_imm_int(&b, PKT3(opcode, 1, 0));
- + cmd_values[1] = nir_imm_int(
- + &b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
- + cmd_values[2] = index_type;
- + } else {
- + cmd_values[0] = nir_imm_int(&b, PKT3(PKT3_INDEX_TYPE, 0, 0));
- + cmd_values[1] = index_type;
- + cmd_values[2] = nir_imm_int(&b, PKT3_NOP_PAD);
- + }
- +
- + nir_ssa_def *addr_upper = nir_channel(&b, data, 1);
- + addr_upper = nir_ishr_imm(&b, nir_ishl(&b, addr_upper, nir_imm_int(&b, 16)), 16);
- +
- + cmd_values[3] = nir_imm_int(&b, PKT3(PKT3_INDEX_BASE, 1, 0));
- + cmd_values[4] = nir_channel(&b, data, 0);
- + cmd_values[5] = addr_upper;
- + cmd_values[6] = nir_imm_int(&b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
- + cmd_values[7] = max_index_count;
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + nir_store_ssbo(&b, nir_vec(&b, cmd_values, 4), cmd_buf, off, .write_mask = 0xf,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_ssbo(&b, nir_vec(&b, cmd_values + 4, 4), cmd_buf, nir_iadd_imm(&b, off, 16),
- + .write_mask = 0xf, .access = ACCESS_NON_READABLE, .align_mul = 4);
- + nir_store_var(&b, cmd_buf_offset, nir_iadd_imm(&b, off, 0x20), 0x1);
- + }
- + nir_pop_if(&b, NULL);
- +
- + nir_ssa_def *index_size = nir_load_var(&b, index_size_var);
- + nir_ssa_def *max_index_count = nir_load_var(&b, max_index_count_var);
- + nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
- + nir_ssa_def *stream_offset =
- + nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);
- +
- + index_size =
- + nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, index_size_var), index_size);
- + max_index_count = nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, max_index_count_var),
- + max_index_count);
- + nir_ssa_def *draw_data0 =
- + nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4);
- + nir_ssa_def *draw_data1 = nir_load_ssbo(
- + &b, 1, 32, stream_buf, nir_iadd_imm(&b, stream_offset, 16), .align_mul = 4);
- + nir_ssa_def *index_count = nir_channel(&b, draw_data0, 0);
- + nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
- + nir_ssa_def *first_index = nir_channel(&b, draw_data0, 2);
- + nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 3);
- + nir_ssa_def *first_instance = nir_channel(&b, draw_data1, 0);
- +
- + nir_push_if(&b, nir_iand(&b, nir_ine(&b, index_count, nir_imm_int(&b, 0)), nir_ine(&b, instance_count, nir_imm_int(&b, 0))));
- + {
- + nir_ssa_def *off = nir_load_var(&b, cmd_buf_offset);
- + off = dgc_emit_userdata_vertex(&b, off, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
- + off = dgc_emit_instance_count(&b, off, instance_count);
- + off = dgc_emit_draw_indexed(&b, off, first_index, index_count,
- + max_index_count);
- + nir_store_var(&b, cmd_buf_offset, off, 0x1);
- + }
- + nir_pop_if(&b, 0);
- + }
- + nir_pop_if(&b, NULL);
- +
- + /* Pad the cmdbuffer if we did not use the whole stride */
- + nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf_offset), cmd_buf_end));
- + {
- + nir_ssa_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf_offset));
- + cnt = nir_ushr_imm(&b, cnt, 2);
- + cnt = nir_iadd_imm(&b, cnt, -2);
- + nir_ssa_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt);
- +
- + nir_ssa_def *cmd_buf = radv_meta_load_descriptor(&b, 0, 2);
- + nir_store_ssbo(&b, pkt, cmd_buf, nir_load_var(&b, cmd_buf_offset), .write_mask = 0x1,
- + .access = ACCESS_NON_READABLE, .align_mul = 4);
- + }
- + nir_pop_if(&b, NULL);
- + }
- + nir_pop_if(&b, NULL);
- +
- + build_dgc_buffer_tail(&b, sequence_count);
- + return b.shader;
- +}
- +
- +void
- +radv_device_finish_dgc_prepare_state(struct radv_device *device)
- +{
- + radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline,
- + &device->meta_state.alloc);
- + radv_DestroyPipelineLayout(radv_device_to_handle(device),
- + device->meta_state.dgc_prepare.p_layout, &device->meta_state.alloc);
- + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
- + device->meta_state.dgc_prepare.ds_layout,
- + &device->meta_state.alloc);
- +}
- +
- +VkResult
- +radv_device_init_dgc_prepare_state(struct radv_device *device)
- +{
- + VkResult result;
- + nir_shader *cs = build_dgc_prepare_shader(device);
- +
- + VkDescriptorSetLayoutCreateInfo ds_create_info = {
- + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
- + .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
- + .bindingCount = 5,
- + .pBindings = (VkDescriptorSetLayoutBinding[]){
- + {.binding = 0, // index
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .descriptorCount = 1,
- + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- + .pImmutableSamplers = NULL},
- + {.binding = 1, // token stream
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .descriptorCount = 1,
- + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- + .pImmutableSamplers = NULL},
- + {.binding = 2, // prepare buffer
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .descriptorCount = 1,
- + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- + .pImmutableSamplers = NULL},
- + {.binding = 3, // params
- + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
- + .descriptorCount = 1,
- + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- + .pImmutableSamplers = NULL},
- + {.binding = 4, // count buffer
- + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
- + .descriptorCount = 1,
- + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- + .pImmutableSamplers = NULL},
- + }};
- +
- + result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info,
- + &device->meta_state.alloc,
- + &device->meta_state.dgc_prepare.ds_layout);
- + if (result != VK_SUCCESS)
- + goto fail;
- +
- + const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
- + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
- + .setLayoutCount = 1,
- + .pSetLayouts = &device->meta_state.dgc_prepare.ds_layout,
- + .pushConstantRangeCount = 1,
- + .pPushConstantRanges =
- + &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)},
- + };
- +
- + result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
- + &device->meta_state.alloc,
- + &device->meta_state.dgc_prepare.p_layout);
- + if (result != VK_SUCCESS)
- + goto fail;
- +
- + VkPipelineShaderStageCreateInfo shader_stage = {
- + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
- + .stage = VK_SHADER_STAGE_COMPUTE_BIT,
- + .module = vk_shader_module_handle_from_nir(cs),
- + .pName = "main",
- + .pSpecializationInfo = NULL,
- + };
- +
- + VkComputePipelineCreateInfo pipeline_info = {
- + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
- + .stage = shader_stage,
- + .flags = 0,
- + .layout = device->meta_state.dgc_prepare.p_layout,
- + };
- +
- + result = radv_CreateComputePipelines(
- + radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
- + &pipeline_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline);
- + if (result != VK_SUCCESS)
- + goto fail;
- +
- + ralloc_free(cs);
- + return VK_SUCCESS;
- +fail:
- + radv_device_finish_dgc_prepare_state(device);
- + ralloc_free(cs);
- + return result;
- +}
- diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
- index caba21759ab..35bf5ab5bba 100644
- --- a/src/amd/vulkan/radv_meta.c
- +++ b/src/amd/vulkan/radv_meta.c
- @@ -622,10 +622,16 @@ radv_device_init_meta(struct radv_device *device)
- if (result != VK_SUCCESS)
- goto fail_etc_decode;
- + result = radv_device_init_dgc_prepare_state(device);
- + if (result != VK_SUCCESS)
- + goto fail_dgc;
- +
- device->app_shaders_internal = false;
- return VK_SUCCESS;
- +fail_dgc:
- + radv_device_finish_meta_etc_decode_state(device);
- fail_etc_decode:
- radv_device_finish_meta_fmask_copy_state(device);
- fail_fmask_copy:
- @@ -663,6 +669,7 @@ fail_clear:
- void
- radv_device_finish_meta(struct radv_device *device)
- {
- + radv_device_finish_dgc_prepare_state(device);
- radv_device_finish_meta_etc_decode_state(device);
- radv_device_finish_accel_struct_build_state(device);
- radv_device_finish_meta_clear_state(device);
- diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
- index 0f9388acd98..6ba7b8d286e 100644
- --- a/src/amd/vulkan/radv_meta.h
- +++ b/src/amd/vulkan/radv_meta.h
- @@ -107,6 +107,9 @@ void radv_device_finish_accel_struct_build_state(struct radv_device *device);
- VkResult radv_device_init_meta_etc_decode_state(struct radv_device *device, bool on_demand);
- void radv_device_finish_meta_etc_decode_state(struct radv_device *device);
- +VkResult radv_device_init_dgc_prepare_state(struct radv_device *device);
- +void radv_device_finish_dgc_prepare_state(struct radv_device *device);
- +
- void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer,
- uint32_t flags);
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index 9a6790090dc..6c1d78fec9e 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -689,6 +689,12 @@ struct radv_meta_state {
- VkPipelineLayout p_layout;
- VkPipeline pipeline;
- } etc_decode;
- +
- + struct {
- + VkDescriptorSetLayout ds_layout;
- + VkPipelineLayout p_layout;
- + VkPipeline pipeline;
- + } dgc_prepare;
- };
- #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
- --
- 2.36.1
- From d686009428f84a8753a96dfb6b4b8cac931b2bf0 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 27 Jun 2022 23:21:08 +0200
- Subject: [PATCH 09/12] radv: Implement DGC generated command layout structure.
- ---
- .../vulkan/radv_device_generated_commands.c | 185 ++++++++++++++++++
- src/amd/vulkan/radv_private.h | 30 +++
- 2 files changed, 215 insertions(+)
- diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
- index 68d8e4d5060..c9838b4839b 100644
- --- a/src/amd/vulkan/radv_device_generated_commands.c
- +++ b/src/amd/vulkan/radv_device_generated_commands.c
- @@ -26,6 +26,72 @@
- #include "nir_builder.h"
- +static void
- +radv_get_sequence_size(const struct radv_indirect_command_layout *layout,
- + const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
- + uint32_t *upload_size)
- +{
- + *cmd_size = 0;
- + *upload_size = 0;
- +
- + if (layout->bind_vbo_mask) {
- + *upload_size += 16 * util_bitcount(pipeline->vb_desc_usage_mask);
- + *cmd_size += 3 * 4;
- + }
- +
- + if (layout->push_constant_mask) {
- + bool need_copy = false;
- +
- + for (unsigned i = 0; i < ARRAY_SIZE(pipeline->base.shaders); ++i) {
- + if (!pipeline->base.shaders[i])
- + continue;
- +
- + struct radv_userdata_locations *locs = &pipeline->base.shaders[i]->info.user_sgprs_locs;
- + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
- + *cmd_size += 12;
- + need_copy = true;
- + }
- + if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
- + *cmd_size += 8 + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].num_sgprs * 4;
- + }
- + if (need_copy)
- + *upload_size +=
- + align(pipeline->base.push_constant_size + 16 * pipeline->base.dynamic_offset_count, 16);
- + }
- +
- + if (layout->binds_index_buffer)
- + *cmd_size += (3 + 2 + 3) * 4;
- + if (layout->indexed)
- + *cmd_size += (5 + 2 + 5) * 4;
- + else
- + *cmd_size += (5 + 2 + 3) * 4;
- +
- + if (layout->binds_state) {
- + *cmd_size += 3 * 4;
- +
- + if (pipeline->base.device->physical_device->rad_info.has_gfx9_scissor_bug)
- + *cmd_size += (8 + 2 * MAX_SCISSORS) * 4;
- + }
- +}
- +
- +static uint32_t
- +radv_align_cmdbuf_size(uint32_t size)
- +{
- + return align(MAX2(1, size), 256);
- +}
- +
- +uint32_t
- +radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
- +{
- + VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
- + VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
- + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- +
- + uint32_t cmd_size, upload_size;
- + radv_get_sequence_size(layout, graphics_pipeline, &cmd_size, &upload_size);
- + return radv_align_cmdbuf_size(cmd_size * cmd_info->sequencesCount);
- +}
- +
- enum radv_dgc_token_type {
- RADV_DGC_INDEX_BUFFER,
- RADV_DGC_DRAW,
- @@ -890,3 +956,122 @@ fail:
- ralloc_free(cs);
- return result;
- }
- +
- +VkResult
- +radv_CreateIndirectCommandsLayoutNV(VkDevice _device,
- + const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
- + const VkAllocationCallbacks *pAllocator,
- + VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
- +{
- + RADV_FROM_HANDLE(radv_device, device, _device);
- + struct radv_indirect_command_layout *layout;
- +
- + size_t size =
- + sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
- +
- + layout =
- + vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
- + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- + if (!layout)
- + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- +
- + vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
- +
- + layout->input_stride = pCreateInfo->pStreamStrides[0];
- + layout->token_count = pCreateInfo->tokenCount;
- + typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
- +
- + layout->indexed = false;
- + layout->binds_index_buffer = false;
- + layout->bind_vbo_mask = 0;
- + layout->push_constant_mask = 0;
- +
- + layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
- + layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_EXT;
- +
- + for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
- + switch (pCreateInfo->pTokens[i].tokenType) {
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
- + layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
- + break;
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
- + layout->indexed = true;
- + layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
- + break;
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
- + layout->binds_index_buffer = true;
- + layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
- + /* 16-bit is implied if we find no match. */
- + for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
- + if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
- + layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
- + else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_EXT)
- + layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
- + }
- + break;
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
- + layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
- + layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] =
- + pCreateInfo->pTokens[i].offset;
- + if (pCreateInfo->pTokens[i].vertexDynamicStride)
- + layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= 1u << 15;
- + break;
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
- + for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
- + k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
- + layout->push_constant_mask |= 1ull << j;
- + layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
- + }
- + break;
- + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
- + layout->binds_state = true;
- + layout->state_offset = pCreateInfo->pTokens[i].offset;
- + break;
- + default:
- + unreachable("Unhandled token type");
- + }
- + }
- + if (!layout->indexed)
- + layout->binds_index_buffer = false;
- +
- + *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
- + return VK_SUCCESS;
- +}
- +
- +void
- +radv_DestroyIndirectCommandsLayoutNV(VkDevice _device,
- + VkIndirectCommandsLayoutNV indirectCommandsLayout,
- + const VkAllocationCallbacks *pAllocator)
- +{
- + RADV_FROM_HANDLE(radv_device, device, _device);
- + VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
- +
- + if (!layout)
- + return;
- +
- + vk_object_base_finish(&layout->base);
- + vk_free2(&device->vk.alloc, pAllocator, layout);
- +}
- +
- +void
- +radv_GetGeneratedCommandsMemoryRequirementsNV(
- + VkDevice _device, const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
- + VkMemoryRequirements2 *pMemoryRequirements)
- +{
- + RADV_FROM_HANDLE(radv_device, device, _device);
- + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
- + VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
- + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- +
- + uint32_t cmd_stride, upload_stride;
- + radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);
- +
- + VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(cmd_stride * pInfo->maxSequencesCount);
- + VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
- +
- + pMemoryRequirements->memoryRequirements.memoryTypeBits =
- + device->physical_device->memory_types_32bit;
- + pMemoryRequirements->memoryRequirements.alignment = 256;
- + pMemoryRequirements->memoryRequirements.size =
- + align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
- +}
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index 6c1d78fec9e..38693974243 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -2917,6 +2917,34 @@ void radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer);
- void radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
- const struct radv_barrier_data *barrier);
- +struct radv_indirect_command_layout {
- + struct vk_object_base base;
- +
- + uint32_t input_stride;
- + uint32_t token_count;
- +
- + bool indexed;
- + bool binds_index_buffer;
- + bool binds_state;
- + uint16_t draw_params_offset;
- + uint16_t index_buffer_offset;
- +
- + uint16_t state_offset;
- +
- + uint32_t bind_vbo_mask;
- + uint32_t vbo_offsets[MAX_VBS];
- +
- + uint64_t push_constant_mask;
- + uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4];
- +
- + uint32_t ibo_type_32;
- + uint32_t ibo_type_8;
- +
- + VkIndirectCommandsLayoutTokenNV tokens[0];
- +};
- +
- +uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
- +
- uint64_t radv_get_current_time(void);
- static inline uint32_t
- @@ -3150,6 +3178,8 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(radv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
- VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
- VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, vk.base, VkImageView,
- VK_OBJECT_TYPE_IMAGE_VIEW);
- +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, base, VkIndirectCommandsLayoutNV,
- + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV)
- VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, base, VkPipelineCache,
- VK_OBJECT_TYPE_PIPELINE_CACHE)
- VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, base, VkPipeline,
- --
- 2.36.1
- From 4f08d81a892e55f463611ee2544205c1a7da0945 Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 27 Jun 2022 23:28:14 +0200
- Subject: [PATCH 10/12] radv: Implement DGC cmdbuffer generation.
- ---
- .../vulkan/radv_device_generated_commands.c | 287 ++++++++++++++++++
- src/amd/vulkan/radv_private.h | 3 +
- 2 files changed, 290 insertions(+)
- diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
- index c9838b4839b..3a8aa562e3a 100644
- --- a/src/amd/vulkan/radv_device_generated_commands.c
- +++ b/src/amd/vulkan/radv_device_generated_commands.c
- @@ -1075,3 +1075,290 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(
- pMemoryRequirements->memoryRequirements.size =
- align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
- }
- +
- +static uint32_t
- +radv_get_vgt_index_size(uint32_t type)
- +{
- + switch (type) {
- + case V_028A7C_VGT_INDEX_8:
- + return 1;
- + case V_028A7C_VGT_INDEX_16:
- + return 2;
- + case V_028A7C_VGT_INDEX_32:
- + return 4;
- + default:
- + unreachable("invalid index type");
- + }
- +}
- +
- +void
- +radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
- + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
- +{
- + /* Can't do anything here: in some cases we depend on dynamic state that is
- + * only known at draw time. */
- +}
- +
- +/* Always need to call this directly before draw due to dependence on bound state. */
- +void
- +radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer,
- + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
- +{
- + VK_FROM_HANDLE(radv_indirect_command_layout, layout,
- + pGeneratedCommandsInfo->indirectCommandsLayout);
- + VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
- + VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
- + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- + struct radv_meta_saved_state saved_state;
- + struct radv_buffer token_buffer;
- +
- + if (cmd_buffer->device->meta_state.dgc_prepare.pipeline == VK_NULL_HANDLE) {
- + radv_device_init_dgc_prepare_state(cmd_buffer->device);
- + }
- +
- + uint32_t cmd_stride, upload_stride;
- + radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);
- +
- + unsigned cmd_buf_size =
- + radv_align_cmdbuf_size(cmd_stride * pGeneratedCommandsInfo->sequencesCount);
- +
- + radv_meta_save(
- + &saved_state, cmd_buffer,
- + RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
- +
- + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
- + cmd_buffer->device->meta_state.dgc_prepare.pipeline);
- +
- + unsigned vb_size = layout->bind_vbo_mask ? util_bitcount(graphics_pipeline->vb_desc_usage_mask) * 24 : 0;
- + unsigned const_size = graphics_pipeline->base.push_constant_size +
- + 16 * graphics_pipeline->base.dynamic_offset_count +
- + sizeof(layout->push_constant_offsets) + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;
- + if (!layout->push_constant_mask)
- + const_size = 0;
- +
- + unsigned scissor_size = (8 + 2 * cmd_buffer->state.dynamic.scissor.count) * 4;
- + if (!layout->binds_state || !cmd_buffer->state.dynamic.scissor.count ||
- + !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
- + scissor_size = 0;
- +
- + unsigned upload_size = MAX2(vb_size + const_size + scissor_size, 16);
- +
- + void *upload_data;
- + unsigned upload_offset;
- + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, upload_size, &upload_offset, &upload_data))
- + abort();
- +
- + void *upload_data_base = upload_data;
- +
- + radv_buffer_init(&token_buffer, cmd_buffer->device, cmd_buffer->upload.upload_bo, upload_size,
- + upload_offset);
- +
- + uint64_t upload_addr = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
- + pGeneratedCommandsInfo->preprocessOffset;
- +
- + uint16_t vtx_base_sgpr =
- + (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
- + if (cmd_buffer->state.graphics_pipeline->uses_drawid)
- + vtx_base_sgpr |= 1u << 14;
- + if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
- + vtx_base_sgpr |= 1u << 15;
- +
- + uint16_t vbo_sgpr =
- + ((radv_lookup_user_sgpr(&graphics_pipeline->base, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS)->sgpr_idx * 4 +
- + graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX]) -
- + SI_SH_REG_OFFSET) >>
- + 2;
- + struct radv_dgc_params params = {
- + .cmd_buf_stride = cmd_stride,
- + .cmd_buf_size = cmd_buf_size,
- + .upload_addr = (uint32_t)upload_addr,
- + .upload_stride = upload_stride,
- + .sequence_count = pGeneratedCommandsInfo->sequencesCount,
- + .stream_stride = layout->input_stride,
- + .draw_indexed = layout->indexed,
- + .draw_params_offset = layout->draw_params_offset,
- + .base_index_size =
- + layout->binds_index_buffer ? 0 : radv_get_vgt_index_size(cmd_buffer->state.index_type),
- + .vtx_base_sgpr = vtx_base_sgpr,
- + .max_index_count = cmd_buffer->state.max_index_count,
- + .index_buffer_offset = layout->index_buffer_offset,
- + .vbo_reg = vbo_sgpr,
- + .ibo_type_32 = layout->ibo_type_32,
- + .ibo_type_8 = layout->ibo_type_8,
- + .emit_state = layout->binds_state,
- + .pa_su_sc_mode_cntl_base = radv_get_pa_su_sc_mode_cntl(cmd_buffer) & C_028814_FACE,
- + .state_offset = layout->state_offset,
- + };
- +
- + if (layout->bind_vbo_mask) {
- + write_vertex_descriptors(cmd_buffer, graphics_pipeline, upload_data);
- +
- + uint32_t *vbo_info = (uint32_t *)((char *)upload_data + graphics_pipeline->vb_desc_alloc_size);
- +
- + struct radv_shader *vs_shader = radv_get_shader(&graphics_pipeline->base, MESA_SHADER_VERTEX);
- + const struct radv_vs_input_state *vs_state =
- + vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
- + uint32_t mask = graphics_pipeline->vb_desc_usage_mask;
- + unsigned idx = 0;
- + while (mask) {
- + unsigned i = u_bit_scan(&mask);
- + unsigned binding =
- + vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
- + : (graphics_pipeline->use_per_attribute_vb_descs ? graphics_pipeline->attrib_bindings[i] : i);
- + uint32_t attrib_end =
- + vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : graphics_pipeline->attrib_ends[i];
- +
- + params.vbo_bind_mask |= ((layout->bind_vbo_mask >> binding) & 1u) << idx;
- + vbo_info[2 * idx] = ((graphics_pipeline->use_per_attribute_vb_descs ? 1u : 0u) << 31) |
- + (vs_state ? vs_state->offsets[i] << 16 : 0) |
- + layout->vbo_offsets[binding];
- + vbo_info[2 * idx + 1] = graphics_pipeline->attrib_index_offset[i] | (attrib_end << 16);
- + ++idx;
- + }
- + params.vbo_cnt = idx;
- + upload_data = (char *)upload_data + vb_size;
- + }
- +
- + if (layout->push_constant_mask) {
- + uint32_t *desc = upload_data;
- + upload_data = (char *)upload_data + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;
- +
- + unsigned idx = 0;
- + for (unsigned i = 0; i < ARRAY_SIZE(graphics_pipeline->base.shaders); ++i) {
- + if (!graphics_pipeline->base.shaders[i])
- + continue;
- +
- + struct radv_userdata_locations *locs = &graphics_pipeline->base.shaders[i]->info.user_sgprs_locs;
- + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
- + params.const_copy = 1;
- +
- + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
- + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
- + unsigned upload_sgpr = 0;
- + unsigned inline_sgpr = 0;
- +
- + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
- + upload_sgpr =
- + (graphics_pipeline->base.user_data_0[i] + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
- + SI_SH_REG_OFFSET) >>
- + 2;
- + }
- +
- + if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
- + inline_sgpr = (graphics_pipeline->base.user_data_0[i] +
- + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
- + SI_SH_REG_OFFSET) >>
- + 2;
- + desc[idx * 3 + 1] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask;
- + desc[idx * 3 + 2] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask >> 32;
- + }
- + desc[idx * 3] = upload_sgpr | (inline_sgpr << 16);
- + ++idx;
- + }
- + }
- +
- + params.push_constant_shader_cnt = idx;
- +
- + params.const_copy_size = graphics_pipeline->base.push_constant_size +
- + 16 * graphics_pipeline->base.dynamic_offset_count;
- + params.push_constant_mask = layout->push_constant_mask;
- +
- + memcpy(upload_data, layout->push_constant_offsets, sizeof(layout->push_constant_offsets));
- + upload_data = (char *)upload_data + sizeof(layout->push_constant_offsets);
- +
- + memcpy(upload_data, cmd_buffer->push_constants, graphics_pipeline->base.push_constant_size);
- + upload_data = (char *)upload_data + graphics_pipeline->base.push_constant_size;
- +
- + struct radv_descriptor_state *descriptors_state =
- + radv_get_descriptors_state(cmd_buffer, pGeneratedCommandsInfo->pipelineBindPoint);
- + memcpy(upload_data, descriptors_state->dynamic_buffers, 16 * graphics_pipeline->base.dynamic_offset_count);
- + }
- +
- + if (scissor_size) {
- + params.scissor_offset = (char*)upload_data - (char*)upload_data_base;
- + params.scissor_count = scissor_size / 4;
- +
- + struct radeon_cmdbuf scissor_cs = {
- + .buf = upload_data,
- + .cdw = 0,
- + .max_dw = scissor_size / 4
- + };
- +
- + si_write_scissors(&scissor_cs, 0, cmd_buffer->state.dynamic.scissor.count,
- + cmd_buffer->state.dynamic.scissor.scissors,
- + cmd_buffer->state.dynamic.viewport.viewports,
- + cmd_buffer->state.emitted_graphics_pipeline->can_use_guardband);
- + assert(scissor_cs.cdw * 4 == scissor_size);
- + upload_data = (char *)upload_data + scissor_size;
- + }
- +
- + VkWriteDescriptorSet ds_writes[5];
- + VkDescriptorBufferInfo buf_info[ARRAY_SIZE(ds_writes)];
- + int ds_cnt = 0;
- + buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer),
- + .offset = 0,
- + .range = upload_size};
- + ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
- + .dstBinding = 3,
- + .dstArrayElement = 0,
- + .descriptorCount = 1,
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .pBufferInfo = &buf_info[ds_cnt]};
- + ++ds_cnt;
- +
- + buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->preprocessBuffer,
- + .offset = pGeneratedCommandsInfo->preprocessOffset,
- + .range = pGeneratedCommandsInfo->preprocessSize};
- + ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
- + .dstBinding = 2,
- + .dstArrayElement = 0,
- + .descriptorCount = 1,
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .pBufferInfo = &buf_info[ds_cnt]};
- + ++ds_cnt;
- +
- + if (pGeneratedCommandsInfo->streamCount > 0) {
- + buf_info[ds_cnt] =
- + (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->pStreams[0].buffer,
- + .offset = pGeneratedCommandsInfo->pStreams[0].offset,
- + .range = VK_WHOLE_SIZE};
- + ds_writes[ds_cnt] =
- + (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
- + .dstBinding = 1,
- + .dstArrayElement = 0,
- + .descriptorCount = 1,
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .pBufferInfo = &buf_info[ds_cnt]};
- + ++ds_cnt;
- + }
- +
- + if (pGeneratedCommandsInfo->sequencesCountBuffer != VK_NULL_HANDLE) {
- + buf_info[ds_cnt] =
- + (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->sequencesCountBuffer,
- + .offset = pGeneratedCommandsInfo->sequencesCountOffset,
- + .range = VK_WHOLE_SIZE};
- + ds_writes[ds_cnt] =
- + (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
- + .dstBinding = 4,
- + .dstArrayElement = 0,
- + .descriptorCount = 1,
- + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- + .pBufferInfo = &buf_info[ds_cnt]};
- + ++ds_cnt;
- + params.sequence_count = UINT32_MAX;
- + }
- +
- + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
- + cmd_buffer->device->meta_state.dgc_prepare.p_layout,
- + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), ¶ms);
- +
- + radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
- + cmd_buffer->device->meta_state.dgc_prepare.p_layout, 0, ds_cnt,
- + ds_writes);
- +
- + unsigned block_count = MAX2(1, round_up_u32(pGeneratedCommandsInfo->sequencesCount, 64));
- + radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
- +
- + radv_buffer_finish(&token_buffer);
- + radv_meta_restore(&saved_state, cmd_buffer);
- +}
- \ No newline at end of file
- diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
- index 38693974243..b4f088250ac 100644
- --- a/src/amd/vulkan/radv_private.h
- +++ b/src/amd/vulkan/radv_private.h
- @@ -2945,6 +2945,9 @@ struct radv_indirect_command_layout {
- uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
- +void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer,
- + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
- +
- uint64_t radv_get_current_time(void);
- static inline uint32_t
- --
- 2.36.1
- From c16a5dc84401af89283ecd3b8188cc30f788b94e Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Mon, 27 Jun 2022 23:29:04 +0200
- Subject: [PATCH 11/12] radv: Implement CmdExecuteGeneratedCommandsNV.
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 80 ++++++++++++++++++++++++++++++++
- 1 file changed, 80 insertions(+)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index eb68eaa53d1..be077eac52e 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -2673,6 +2673,11 @@ radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
- struct radeon_cmdbuf *cs = cmd_buffer->cs;
- struct radv_cmd_state *state = &cmd_buffer->state;
- + /* With indirect generated commands the index buffer bind may be part of the
- + * indirect command buffer, in which case the app may not have bound any yet. */
- + if (state->index_type < 0)
- + return;
- +
- /* For the direct indexed draws we use DRAW_INDEX_2, which includes
- * the index_va and max_index_count already. */
- if (!indirect)
- @@ -7375,6 +7380,81 @@ radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _bu
- radv_after_draw(cmd_buffer);
- }
- +void
- +radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
- + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
- +{
- + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
- + VK_FROM_HANDLE(radv_indirect_command_layout, layout,
- + pGeneratedCommandsInfo->indirectCommandsLayout);
- + VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
- +
- + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
- + cmd_buffer->state.flush_bits |=
- + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
- +
- + struct radv_draw_info info;
- +
- + info.count = pGeneratedCommandsInfo->sequencesCount;
- + info.indirect = prep_buffer; /* We're not really going to use it this way, but it's
- + a good signal that this is not a direct draw. */
- + info.indirect_offset = 0;
- + info.stride = 0;
- + info.strmout_buffer = NULL;
- + info.count_buffer = NULL;
- + info.indexed = layout->indexed;
- + info.instance_count = 0;
- +
- + if (!radv_before_draw(cmd_buffer, &info, 1))
- + return;
- +
- + uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
- + uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
- + pGeneratedCommandsInfo->preprocessOffset;
- + const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
- +
- + if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
- + radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
- + radeon_emit(cmd_buffer->cs, 0);
- + }
- + if (!view_mask) {
- + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
- + radeon_emit(cmd_buffer->cs, va);
- + radeon_emit(cmd_buffer->cs, va >> 32);
- + radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
- + } else {
- + u_foreach_bit(view, view_mask)
- + {
- + radv_emit_view_index(cmd_buffer, view);
- +
- + radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
- + radeon_emit(cmd_buffer->cs, va);
- + radeon_emit(cmd_buffer->cs, va >> 32);
- + radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
- + }
- + }
- +
- + if (layout->binds_index_buffer) {
- + cmd_buffer->state.last_index_type = -1;
- + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
- + }
- +
- + if (layout->bind_vbo_mask)
- + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
- +
- + cmd_buffer->push_constant_stages |= ~0;
- +
- + cmd_buffer->state.last_primitive_reset_en = -1;
- + cmd_buffer->state.last_index_type = -1;
- + cmd_buffer->state.last_num_instances = -1;
- + cmd_buffer->state.last_vertex_offset = -1;
- + cmd_buffer->state.last_first_instance = -1;
- + cmd_buffer->state.last_drawid = -1;
- +
- + radv_after_draw(cmd_buffer);
- + return;
- +}
- +
- struct radv_dispatch_info {
- /**
- * Determine the layout of the grid (in block units) to be used.
- --
- 2.36.1
- From 3fd6f0403afebde6cf27c7f3fadbfa795853890f Mon Sep 17 00:00:00 2001
- From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
- Date: Fri, 7 Jan 2022 12:02:11 +0100
- Subject: [PATCH 12/12] radv: Expose VK_NV_device_generated_commands.
- ---
- src/amd/vulkan/radv_device.c | 25 +++++++++++++++++++++++++
- 1 file changed, 25 insertions(+)
- diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
- index 378d82c5765..e9e81a8ee0a 100644
- --- a/src/amd/vulkan/radv_device.c
- +++ b/src/amd/vulkan/radv_device.c
- @@ -576,6 +576,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
- .GOOGLE_user_type = true,
- .INTEL_shader_integer_functions2 = true,
- .NV_compute_shader_derivatives = true,
- + .NV_device_generated_commands = true,
- .NV_mesh_shader = device->use_ngg && device->rad_info.gfx_level >= GFX10_3 &&
- device->instance->perftest_flags & RADV_PERFTEST_NV_MS && !device->use_llvm,
- /* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from
- @@ -1792,6 +1793,12 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
- features->borderColorSwizzleFromImage = true;
- break;
- }
- + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV: {
- + VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV *features =
- + (VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV *)ext;
- + features->deviceGeneratedCommands = true;
- + break;
- + }
- default:
- break;
- }
- @@ -2476,6 +2483,24 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
- break;
- }
- + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_PROPERTIES_NV: {
- + VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *properties =
- + (VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *)ext;
- + properties->maxIndirectCommandsStreamCount = 1;
- + properties->maxIndirectCommandsStreamStride = UINT32_MAX;
- + properties->maxIndirectCommandsTokenCount = UINT32_MAX;
- + properties->maxIndirectCommandsTokenOffset = UINT16_MAX;
- + properties->minIndirectCommandsBufferOffsetAlignment = 4;
- + properties->minSequencesCountBufferOffsetAlignment = 4;
- + properties->minSequencesIndexBufferOffsetAlignment = 4;
- +
- + /* Don't support a shader group count of even 1 until we support shader
- + * overrides during pipeline creation. */
- + properties->maxGraphicsShaderGroupCount = 0;
- +
- + properties->maxIndirectSequenceCount = UINT32_MAX;
- + break;
- + }
- default:
- break;
- }
- --
- 2.36.1
- From 2a22b602e05ec29b1ff5e8f9376e84df18794c52 Mon Sep 17 00:00:00 2001
- From: Hans-Kristian Arntzen <post@arntzen-software.no>
- Date: Fri, 10 Jun 2022 15:59:07 +0200
- Subject: [PATCH] radv: Flush SMEM/VMEM for indirects.
- Needed since DGC shaders use shader reads.
- Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
- ---
- src/amd/vulkan/radv_cmd_buffer.c | 5 ++---
- 1 file changed, 2 insertions(+), 3 deletions(-)
- diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
- index be077eac52e..99e75de162f 100644
- --- a/src/amd/vulkan/radv_cmd_buffer.c
- +++ b/src/amd/vulkan/radv_cmd_buffer.c
- @@ -4116,7 +4116,8 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_fla
- case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
- /* SCACHE potentially for reading the dispatch size from the shader. The
- * rest is for the DGC shader input. */
- - flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
- + flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
- + flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
- if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
- flush_bits |= RADV_CMD_FLAG_INV_L2;
- break;
- --
- GitLab
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement