[llvm] 864a2b2 - [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting

Fri Mar 17 21:11:56 PDT 2023

Author: Austin Kerbow
Date: 2023-03-17T20:26:23-07:00
New Revision: 864a2b25beac507cc76b50030757283aae434c0c

URL: https://github.com/llvm/llvm-project/commit/864a2b25beac507cc76b50030757283aae434c0c
DIFF: https://github.com/llvm/llvm-project/commit/864a2b25beac507cc76b50030757283aae434c0c.diff

LOG: [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting

ASMPrinter was relying on feature bits to set up extra SGPRs in the kernel
descriptor for the xnack_mask. This was broken for the dynamic XNACK "any" TID
setting, which could cause user SGPRs to be clobbered if the number of SGPRs
reserved was near a granulated block boundary.

When XNACK was enabled this worked correctly in the ASMParser which meant some
kernels were only failing without "-save-temps".

Fixes: SWDEV-382764

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D145401

Added: 
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll

Modified: 
    clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
    llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
    llvm/test/CodeGen/AMDGPU/trap-abis.ll

Removed: 
    


################################################################################
diff  --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index cf0c15b6319f1..9403d12afa05a 100644

--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
 
 // expected-remark@+9 {{Function Name: foo}}
-// expected-remark@+8 {{    SGPRs: 9}}
+// expected-remark@+8 {{    SGPRs: 13}}
 // expected-remark@+7 {{    VGPRs: 10}}
 // expected-remark@+6 {{    AGPRs: 12}}
 // expected-remark@+5 {{    ScratchSize [bytes/lane]: 0}}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0883e7a5ed3a3..82c57dfcef0d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -251,9 +251,9 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
       CurrentProgramInfo.NumVGPRsForWavesPerEU,
       CurrentProgramInfo.NumSGPRsForWavesPerEU -
-          IsaInfo::getNumExtraSGPRs(&STM,
-                                    CurrentProgramInfo.VCCUsed,
-                                    CurrentProgramInfo.FlatUsed),
+          IsaInfo::getNumExtraSGPRs(
+              &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+              getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
       CodeObjectVersion);
 
@@ -721,7 +721,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
   // unified.
   unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
-      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
+      getTargetStreamer()->getTargetID()->isXnackOnOrAny());
 
   // Check the addressable register limit before we add ExtraSGPRs.
   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8e558b539fa72..e639fce9d690e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3061,7 +3061,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 12
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 9
+; GPRIDX-NEXT:     wavefront_sgpr_count = 13
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3913,7 +3913,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 0
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -3956,7 +3956,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 12
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 6
+; GPRIDX-NEXT:     wavefront_sgpr_count = 10
 ; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4259,7 +4259,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 0
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4302,7 +4302,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 12
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 7
+; GPRIDX-NEXT:     wavefront_sgpr_count = 11
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 14db2ab9c419c..824adbecfc3ae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
 
 declare amdgpu_gfx float @extern_func(float) #0
 declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0

diff  --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 3527329d1ee3b..5f3509c2517f7 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s

diff  --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 50a1d48b71304..2616b04332419 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,7 +2,7 @@
 ; RUN: FileCheck -check-prefix=REMARK %s < %t
 
 ; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0:     SGPRs: 24
+; STDERR-NEXT: remark: foo.cl:27:0:     SGPRs: 28
 ; STDERR-NEXT: remark: foo.cl:27:0:     VGPRs: 9
 ; STDERR-NEXT: remark: foo.cl:27:0:     AGPRs: 43
 ; STDERR-NEXT: remark: foo.cl:27:0:     ScratchSize [bytes/lane]: 0
@@ -27,7 +27,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    SGPRs: '
-; REMARK-NEXT:   - NumSGPR:         '24'
+; REMARK-NEXT:   - NumSGPR:         '28'
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -120,7 +120,7 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs: 0
+; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs: 4
 ; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
new file mode 100644
index 0000000000000..2d2d64910c4fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
+
+; TODO: Update to check for granulated sgpr count directive once one is added.
+
+define amdgpu_kernel void @kern() {
+; ASM-LABEL: kern:
+; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_reserve_xnack_mask 1
+
+; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
+; OBJ: Contents of section .rodata:
+; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+
+; ELF: AMDGPU Metadata
+; ELF: .sgpr_count:     9
+entry:
+  tail call void asm sideeffect "", "~{s[0:4]}"()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
new file mode 100644
index 0000000000000..e676f4f8de74d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack < %s | FileCheck --check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
+
+; TODO: Update to check for granulated sgpr count directive once one is added.
+
+define amdgpu_kernel void @kern() {
+; ASM-LABEL: kern:
+; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_reserve_xnack_mask 0
+
+; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
+; OBJ: Contents of section .rodata:
+; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000  ................
+
+; ELF: AMDGPU Metadata
+; ELF: .sgpr_count:     5
+entry:
+  tail call void asm sideeffect "", "~{s[0:4]}"()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
new file mode 100644
index 0000000000000..705bedf450975
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
+
+; TODO: Update to check for granulated sgpr count directive once one is added.
+
+define amdgpu_kernel void @kern() {
+; ASM-LABEL: kern:
+; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_reserve_xnack_mask 1
+
+; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
+; OBJ: Contents of section .rodata:
+; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+
+; ELF: AMDGPU Metadata
+; ELF: .sgpr_count:     9
+entry:
+  tail call void asm sideeffect "", "~{s[0:4]}"()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}

diff  --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 8cdd8ad002c69..c9987ac7831a4 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -17,7 +17,75 @@ declare void @llvm.debugtrap() #1
 
 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: trap:
-; NOHSA-TRAP-GFX900-V2:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V2:         .amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_major = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_minor = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_kind = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_major = 9
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_minor = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_stepping = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     priority = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     float_mode = 240
+; NOHSA-TRAP-GFX900-V2-NEXT:     priv = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_dx10_clamp = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ieee_mode = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wgp_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_mem_ordered = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_fwd_progress = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     user_sgpr_count = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_trap_handler = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_x = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_info = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_vgpr_workitem_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception_msb = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_lds_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_buffer = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_queue_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_flat_scratch_init = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wavefront_size32 = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ordered_append_gds = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_element_size = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_ptr64 = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_dynamic_callstack = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_debug_enabled = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_xnack_enabled = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_private_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_group_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 44
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 8
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_private_segment_buffer_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     group_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_size = 6
+; NOHSA-TRAP-GFX900-V2-NEXT:     call_convention = -1
+; NOHSA-TRAP-GFX900-V2-NEXT:     runtime_loader_kernel_symbol = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:    .end_amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:  ; %bb.0:
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; NOHSA-TRAP-GFX900-V2-NEXT:    v_mov_b32_e32 v0, 0
 ; NOHSA-TRAP-GFX900-V2-NEXT:    v_mov_b32_e32 v1, 1
@@ -161,7 +229,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
 ; HSA-TRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
-; HSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; HSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 1
 ; HSA-TRAP-GFX900-V2-NEXT:     priority = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     float_mode = 240
 ; HSA-TRAP-GFX900-V2-NEXT:     priv = 0
@@ -204,7 +272,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 8
+; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 12
 ; HSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
@@ -261,7 +329,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 1
 ; HSA-NOTRAP-GFX900-V2-NEXT:     priority = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     float_mode = 240
 ; HSA-NOTRAP-GFX900-V2-NEXT:     priv = 0
@@ -304,7 +372,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-NOTRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 8
+; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 12
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
@@ -356,7 +424,75 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 
 define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
 ; NOHSA-TRAP-GFX900-V2-LABEL: non_entry_trap:
-; NOHSA-TRAP-GFX900-V2:       ; %bb.0: ; %entry
+; NOHSA-TRAP-GFX900-V2:         .amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_major = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_minor = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_kind = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_major = 9
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_minor = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_stepping = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     priority = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     float_mode = 240
+; NOHSA-TRAP-GFX900-V2-NEXT:     priv = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_dx10_clamp = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ieee_mode = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wgp_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_mem_ordered = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_fwd_progress = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     user_sgpr_count = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_trap_handler = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_x = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_info = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_vgpr_workitem_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception_msb = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_lds_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_buffer = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_queue_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_flat_scratch_init = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wavefront_size32 = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ordered_append_gds = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_element_size = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_ptr64 = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_dynamic_callstack = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_debug_enabled = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_xnack_enabled = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_private_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_group_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 44
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 8
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_private_segment_buffer_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     group_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_size = 6
+; NOHSA-TRAP-GFX900-V2-NEXT:     call_convention = -1
+; NOHSA-TRAP-GFX900-V2-NEXT:     runtime_loader_kernel_symbol = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:    .end_amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:  ; %bb.0: ; %entry
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; NOHSA-TRAP-GFX900-V2-NEXT:    v_mov_b32_e32 v0, 0
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_waitcnt lgkmcnt(0)
@@ -591,7 +727,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 10
+; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 12
 ; HSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
@@ -712,7 +848,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-NOTRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 10
+; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 12
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 2
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
@@ -792,7 +928,75 @@ ret:
 
 define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap:
-; NOHSA-TRAP-GFX900-V2:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V2:         .amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_major = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_code_version_minor = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_kind = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_major = 9
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_minor = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     amd_machine_version_stepping = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     priority = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     float_mode = 240
+; NOHSA-TRAP-GFX900-V2-NEXT:     priv = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_dx10_clamp = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ieee_mode = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wgp_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_mem_ordered = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_fwd_progress = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     user_sgpr_count = 2
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_trap_handler = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_x = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_id_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_workgroup_info = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_vgpr_workitem_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception_msb = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     granulated_lds_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_exception = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_buffer = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_queue_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_dispatch_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_flat_scratch_init = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_private_segment_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_wavefront_size32 = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     enable_ordered_append_gds = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_element_size = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_ptr64 = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_dynamic_callstack = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_debug_enabled = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     is_xnack_enabled = 1
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_private_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_group_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 44
+; NOHSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 6
+; NOHSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 3
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     reserved_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     debug_private_segment_buffer_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     group_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     private_segment_alignment = 4
+; NOHSA-TRAP-GFX900-V2-NEXT:     wavefront_size = 6
+; NOHSA-TRAP-GFX900-V2-NEXT:     call_convention = -1
+; NOHSA-TRAP-GFX900-V2-NEXT:     runtime_loader_kernel_symbol = 0
+; NOHSA-TRAP-GFX900-V2-NEXT:    .end_amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT:  ; %bb.0:
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; NOHSA-TRAP-GFX900-V2-NEXT:    v_mov_b32_e32 v0, 0
 ; NOHSA-TRAP-GFX900-V2-NEXT:    v_mov_b32_e32 v1, 1
@@ -954,7 +1158,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
 ; HSA-TRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
-; HSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; HSA-TRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 1
 ; HSA-TRAP-GFX900-V2-NEXT:     priority = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     float_mode = 240
 ; HSA-TRAP-GFX900-V2-NEXT:     priv = 0
@@ -997,7 +1201,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-TRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 6
+; HSA-TRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 10
 ; HSA-TRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 3
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-TRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0
@@ -1064,7 +1268,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernel_code_entry_byte_offset = 256
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernel_code_prefetch_byte_size = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_workitem_vgpr_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 0
+; HSA-NOTRAP-GFX900-V2-NEXT:     granulated_wavefront_sgpr_count = 1
 ; HSA-NOTRAP-GFX900-V2-NEXT:     priority = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     float_mode = 240
 ; HSA-NOTRAP-GFX900-V2-NEXT:     priv = 0
@@ -1107,7 +1311,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-NOTRAP-GFX900-V2-NEXT:     gds_segment_byte_size = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     kernarg_segment_byte_size = 8
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workgroup_fbarrier_count = 0
-; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 6
+; HSA-NOTRAP-GFX900-V2-NEXT:     wavefront_sgpr_count = 10
 ; HSA-NOTRAP-GFX900-V2-NEXT:     workitem_vgpr_count = 3
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_first = 0
 ; HSA-NOTRAP-GFX900-V2-NEXT:     reserved_vgpr_count = 0