[llvm] 1e63f82 - AMDGPU: Fix an assertion in SIOptimizeVGPRLiveRange

Thu Apr 27 00:40:11 PDT 2023

Author: Nicolai Hähnle
Date: 2023-04-27T09:39:44+02:00
New Revision: 1e63f8272e8ad2b41f9c4d58250d7de264c4aa4b

URL: https://github.com/llvm/llvm-project/commit/1e63f8272e8ad2b41f9c4d58250d7de264c4aa4b
DIFF: https://github.com/llvm/llvm-project/commit/1e63f8272e8ad2b41f9c4d58250d7de264c4aa4b.diff

LOG: AMDGPU: Fix an assertion in SIOptimizeVGPRLiveRange

As the comment notes, the shader results in an INSERT_SUBREG with
"undef" (dead) operand in the Endif block. The same can happen with
REG_SEQUENCE. The register is considered dead from a liveness
analysis perspective. The correct thing to do seems to be nothing:
we keep the undef use of the register, the register allocator should
still be able to take the liveness into account correctly.

Differential Revision: https://reviews.llvm.org/D149161

Added: 
    llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
    llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index ae2c10116de85..126f56100c64a 100644

--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -522,8 +522,15 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
     auto *UseBlock = UseMI->getParent();
     // Replace uses in Endif block
     if (UseBlock == Endif) {
-      assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
-      O.setReg(NewReg);
+      if (UseMI->isPHI()) {
+        O.setReg(NewReg);
+      } else {
+        // DetectDeadLanes may mark register uses as undef without removing
+        // them, in which case a non-phi instruction using the original register
+        // may exist in the Endif block even though the register is not live
+        // into it.
+        assert(!O.readsReg());
+      }
       continue;
     }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll b/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
new file mode 100644
index 0000000000000..a9cbcf073af61
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-deadlanes.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=false < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_main:
+;
+; This test case used to hit an assertion in SIOptimizeVGPRLiveRange because of a vector with dead lanes
+; leading to an effectively dead INSERT_SUBREG.
+
+
+define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 %descTable2) #0 {
+.entry:
+  %i2 = zext i32 %descTable2 to i64
+  %i4 = inttoptr i64 %i2 to ptr addrspace(4)
+  %i159 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
+  %i1540 = shufflevector <4 x float> %i159, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %i1526 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> poison, i32 2688, i32 0)
+  %i1746 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+  br label %bb1750
+
+bb1750:                                           ; preds = %bb1897, %.entry
+  %__llpc_global_proxy_r3.12.vec.extract2358295 = phi i32 [ 0, %.entry ], [ %__llpc_global_proxy_r3.12.vec.extract2358, %bb1897 ]
+  %__llpc_global_proxy_r13.20293 = phi <4 x i32> [ undef, %.entry ], [ %__llpc_global_proxy_r13.22, %bb1897 ]
+  %__llpc_global_proxy_r10.19291 = phi <4 x i32> [ poison, %.entry ], [ %i1914, %bb1897 ]
+  %i1751 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i1746, i32 poison, i32 0, i32 0, i32 0)
+  %i1754 = shufflevector <4 x i32> %__llpc_global_proxy_r10.19291, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %__llpc_global_proxy_r7.12.vec.extract1953260 = bitcast float %i1751 to i32
+  %i1760 = or i32 %__llpc_global_proxy_r7.12.vec.extract1953260, %__llpc_global_proxy_r3.12.vec.extract2358295
+  %i1786 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> poison, i32 %i1760, i32 0)
+  %__llpc_global_proxy_r11.12.vec.insert1237 = insertelement <4 x i32> %i1754, i32 %i1786, i64 3
+  %.not2783 = icmp eq i32 %i1786, 0
+  br i1 %.not2783, label %bb1789, label %bb1787
+
+bb1787:                                           ; preds = %bb1750
+  %i1788 = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> %__llpc_global_proxy_r13.20293, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  br label %bb1897
+
+bb1789:                                           ; preds = %bb1750
+  %i1796 = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fmed3.f32(float poison, float 0.000000e+00, float 1.000000e+00)
+  %i1797 = fmul reassoc nnan nsz arcp contract afn float %i1796, %i1796
+  %i1800 = fdiv reassoc nnan nsz arcp contract afn float %i1797, 0.000000e+00
+  %i1801 = bitcast float %i1800 to i32
+  %__llpc_global_proxy_r11.12.vec.insert1245 = insertelement <4 x i32> %__llpc_global_proxy_r11.12.vec.insert1237, i32 poison, i64 3
+  %i1818 = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fmed3.f32(float poison, float 0.000000e+00, float 1.000000e+00)
+  %i1819 = bitcast float %i1818 to i32
+  %i1878 = shufflevector <4 x i32> %__llpc_global_proxy_r11.12.vec.insert1245, <4 x i32> poison, <3 x i32> <i32 3, i32 3, i32 3>
+  %i1879 = bitcast <3 x i32> %i1878 to <3 x float>
+  %i1881 = fmul reassoc nnan nsz arcp contract afn <3 x float> %i1540, %i1879
+  %i1882 = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> poison, i32 poison, i32 0)
+  %i1883 = shufflevector <3 x i32> %i1882, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %i1884 = bitcast <4 x i32> %i1883 to <4 x float>
+  %i1885 = shufflevector <4 x float> %i1884, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %i1886 = insertelement <3 x i32> undef, i32 %i1819, i64 0
+  %i1887 = bitcast <3 x i32> %i1886 to <3 x float>
+  %i1888 = insertelement <3 x i32> undef, i32 %i1801, i64 0
+  %i1889 = bitcast <3 x i32> %i1888 to <3 x float>
+  %i1890 = fmul reassoc nnan nsz arcp contract afn <3 x float> %i1887, %i1889
+  %i1891 = shufflevector <3 x float> %i1890, <3 x float> poison, <3 x i32> zeroinitializer
+  %i1892 = fmul reassoc nnan nsz arcp contract afn <3 x float> %i1885, %i1891
+  %i1893 = fmul reassoc nnan nsz arcp contract afn <3 x float> %i1892, %i1881
+  %i1894 = bitcast <3 x float> %i1893 to <3 x i32>
+  %i1895 = shufflevector <3 x i32> %i1894, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %i1896 = insertelement <4 x i32> %i1895, i32 %i1819, i64 3
+  br label %bb1897
+
+bb1897:                                           ; preds = %bb1789, %bb1787
+  %__llpc_global_proxy_r11.19 = phi <4 x i32> [ %__llpc_global_proxy_r11.12.vec.insert1237, %bb1787 ], [ %__llpc_global_proxy_r11.12.vec.insert1245, %bb1789 ]
+  %__llpc_global_proxy_r13.22 = phi <4 x i32> [ %i1788, %bb1787 ], [ %i1896, %bb1789 ]
+  %i1898 = shufflevector <4 x i32> %__llpc_global_proxy_r11.19, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %i1899 = bitcast <3 x i32> %i1898 to <3 x float>
+  %i1900 = shufflevector <4 x i32> %__llpc_global_proxy_r13.22, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %i1901 = bitcast <3 x i32> %i1900 to <3 x float>
+  %i1902 = fadd reassoc nnan nsz arcp contract afn <3 x float> %i1901, %i1899
+  %i1903 = bitcast <3 x float> %i1902 to <3 x i32>
+  %i1907 = shufflevector <3 x i32> %i1903, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %i1908 = shufflevector <4 x i32> %i1907, <4 x i32> %__llpc_global_proxy_r11.19, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %i1914 = shufflevector <4 x i32> %i1908, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %__llpc_global_proxy_r3.12.vec.extract2358 = extractelement <2 x i32> zeroinitializer, i64 1
+  %.not2780.not = icmp ult i32 %__llpc_global_proxy_r3.12.vec.extract2358, %i1526
+  br i1 %.not2780.not, label %bb1750, label %._crit_edge298
+
+._crit_edge298:                                   ; preds = %bb1897
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg)
+declare float @llvm.amdgcn.fmed3.f32(float, float, float)
+declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
+declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg)
+
+attributes #0 = { "target-features"=",+wavefrontsize64,+cumode" }

diff  --git a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir
new file mode 100644
index 0000000000000..71c408ced015b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=si-opt-vgpr-liverange -o - %s | FileCheck -check-prefixes=CHECK %s
+
+# Tests a case that used to assert in SIOptimizeVGPRLiveRange when trying to optimize %3 which still appears
+# (though in an undef operand) in the REG_SEQUENCE of the "endif block". This undef pattern was caused by
+# DetectDeadLanes.
+
+--- |
+  define dllexport amdgpu_ps void @_amdgpu_ps_main() #0 {
+    unreachable
+  }
+
+  attributes #0 = { "target-cpu"="gfx1100" "target-features"=",+wavefrontsize64,+cumode" "uniform-work-group-size"="false" }
+...
+---
+name:            _amdgpu_ps_main
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: vgpr_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+  - { id: 4, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 5, class: vreg_128, preferred-register: '' }
+  - { id: 6, class: sgpr_128, preferred-register: '' }
+  - { id: 7, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%1' }
+body:             |
+  ; CHECK-LABEL: name: _amdgpu_ps_main
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN killed [[COPY]], undef %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[BUFFER_LOAD_DWORD_OFFEN]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, undef %4.sub3, %subreg.sub3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vreg_128 = PHI undef %10:vreg_128, %bb.0, [[REG_SEQUENCE]], %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[BUFFER_LOAD_DWORD_OFFEN]], %bb.0, undef %15:vgpr_32, %bb.1
+  ; CHECK-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, [[PHI1]], %subreg.sub1, [[PHI1]], %subreg.sub2, undef %6:vgpr_32, %subreg.sub3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:vreg_128 = PHI [[PHI]], %bb.2, [[REG_SEQUENCE1]], %bb.3
+  ; CHECK-NEXT:   SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   dead [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI2]].sub2, %subreg.sub0, [[PHI2]].sub2, %subreg.sub1, [[PHI2]].sub2, %subreg.sub2, undef [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub3
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+    liveins: $vgpr0
+
+    %1:vgpr_32 = COPY killed $vgpr0
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %1, undef %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+    %4:sreg_64 = V_CMP_NE_U32_e64 0, %3, implicit $exec
+    %8:sreg_64 = SI_IF killed %4, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+    %9:vreg_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, undef %5.sub3:vreg_128, %subreg.sub3
+
+  bb.6:
+    successors: %bb.7(0x40000000), %bb.8(0x40000000)
+
+    %10:vreg_128 = PHI undef %156:vreg_128, %bb.0, %9, %bb.5
+    %11:sreg_64 = SI_ELSE killed %8, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.7
+
+  bb.7:
+    successors: %bb.8(0x80000000)
+
+    %12:vreg_128 = REG_SEQUENCE %3, %subreg.sub0, %3, %subreg.sub1, killed %3, %subreg.sub2, undef %7, %subreg.sub3
+
+  bb.8:
+    %13:vreg_128 = PHI %10, %bb.6, %12, %bb.7
+    SI_END_CF killed %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:vreg_128 = REG_SEQUENCE %13.sub2, %subreg.sub0, %13.sub2, %subreg.sub1, killed %13.sub2, %subreg.sub2, undef %3, %subreg.sub3
+    S_ENDPGM 0
+...