[llvm] 2bca7d8 - [AMDGPU] Graceful abort for waterfalls in SIOptimizeVGPRLiveRange

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 11 22:16:20 PDT 2022


Author: Carl Ritson
Date: 2022-04-12T14:15:44+09:00
New Revision: 2bca7d859ac23226c79f06b9aade03b7b27297e0

URL: https://github.com/llvm/llvm-project/commit/2bca7d859ac23226c79f06b9aade03b7b27297e0
DIFF: https://github.com/llvm/llvm-project/commit/2bca7d859ac23226c79f06b9aade03b7b27297e0.diff

LOG: [AMDGPU] Graceful abort for waterfalls in SIOptimizeVGPRLiveRange

If the CFG structure of a waterfall loop is not the expected shape
then gracefully abort traversing the IR for the given loop.
This applies to nest waterfall loops which are not supported by
the VGPR live range optimizer.

Reviewed By: ruiling

Differential Revision: https://reviews.llvm.org/D123480

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 07232247c4bde..1cc7f79e39564 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -344,9 +344,13 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
     }
     if (MBB == LoopEnd)
       break;
-    assert(MBB->pred_size() == 1 ||
-           (MBB == LoopHeader && MBB->pred_size() == 2));
-    assert(MBB->succ_size() == 1);
+
+    if ((MBB != LoopHeader && MBB->pred_size() != 1) ||
+        (MBB == LoopHeader && MBB->pred_size() != 2) || MBB->succ_size() != 1) {
+      LLVM_DEBUG(dbgs() << "Unexpected edges in CFG, ignoring loop\n");
+      return;
+    }
+
     MBB = *MBB->succ_begin();
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index de39119700d4c..f7285051247cb 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -558,7 +558,129 @@ sw.bb18:                                          ; preds = %sw.bb, %if.then9
   ret void
 }
 
+%tex = type opaque
+define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.coerce) local_unnamed_addr {
+  ; SI-LABEL: name: nested_waterfalls
+  ; SI: bb.0.entry:
+  ; SI-NEXT:   successors: %bb.1(0x80000000)
+  ; SI-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1
+  ; SI-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
+  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset.cast, align 4, addrspace 4)
+  ; SI-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.1.if.then:
+  ; SI-NEXT:   successors: %bb.2(0x80000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 3, killed [[REG_SEQUENCE]], implicit $exec
+  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec
+  ; SI-NEXT:   %69:vgpr_32, dead %71:sreg_32_xm0_xexec = V_ADDC_U32_e64 killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %69, %subreg.sub1
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_]], 16, 0, implicit $exec :: (load (s128) from %ir.6 + 16, addrspace 4)
+  ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
+  ; SI-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
+  ; SI-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub1
+  ; SI-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub0
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_]], 0, 0, implicit $exec :: (load (s128) from %ir.6, align 32, addrspace 4)
+  ; SI-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub3
+  ; SI-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub2
+  ; SI-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub1
+  ; SI-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_1]].sub0
+  ; SI-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3, killed [[COPY5]], %subreg.sub4, killed [[COPY4]], %subreg.sub5, killed [[COPY3]], %subreg.sub6, killed [[COPY2]], %subreg.sub7
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 killed [[GLOBAL_LOAD_DWORDX2_]], 48, 0, implicit $exec :: (load (s128) from %ir.8, addrspace 4)
+  ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.2:
+  ; SI-NEXT:   successors: %bb.3(0x80000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc
+  ; SI-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_4]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub7
+  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.3:
+  ; SI-NEXT:   successors: %bb.4(0x80000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.4:
+  ; SI-NEXT:   successors: %bb.5(0x80000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc
+  ; SI-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_8]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub3
+  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_3]], implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.5:
+  ; SI-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, [[REG_SEQUENCE7]], killed [[REG_SEQUENCE10]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource")
+  ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
+  ; SI-NEXT:   SI_WATERFALL_LOOP %bb.4, implicit $exec
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.6:
+  ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.7(0x40000000)
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT:   $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]]
+  ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc
+  ; SI-NEXT:   SI_WATERFALL_LOOP %bb.2, implicit $exec
+  ; SI-NEXT: {{  $}}
+  ; SI-NEXT: bb.7:
+  ; SI-NEXT:   $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]]
+  ; SI-NEXT:   GLOBAL_STORE_DWORD undef %30:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `float addrspace(1)* undef`, addrspace 1)
+  ; SI-NEXT:   S_ENDPGM 0
+entry:
+  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %1 = zext i32 %0 to i64
+  br label %if.then
+
+if.then:                                          ; preds = %entry
+  %idx = getelementptr inbounds %tex*, %tex* addrspace(1)* %tex.coerce, i64 %1
+  %2 = load %tex*, %tex* addrspace(1)* %idx, align 8
+  %3 = bitcast %tex* %2 to i32*
+  %4 = addrspacecast i32* %3 to i32 addrspace(4)*
+  %add.ptr.i = getelementptr inbounds i32, i32 addrspace(4)* %4, i64 12
+  %5 = bitcast %tex* %2 to <8 x i32>*
+  %6 = addrspacecast <8 x i32>* %5 to <8 x i32> addrspace(4)*
+  %7 = load <8 x i32>, <8 x i32> addrspace(4)* %6, align 32
+  %8 = bitcast i32 addrspace(4)* %add.ptr.i to <4 x i32> addrspace(4)*
+  %9 = load <4 x i32>, <4 x i32> addrspace(4)* %8, align 16
+  %10 = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float undef, <8 x i32> %7, <4 x i32> %9, i1 false, i32 0, i32 0)
+  store float %10, float addrspace(1)* undef, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
+declare float @llvm.amdgcn.image.sample.2d.f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #2
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind readonly willreturn }


        


More information about the llvm-commits mailing list