[llvm] Reapply inline spiller subranges (PR #70194)

via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 25 04:01:24 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Stack of patches to show the diff from the revert.

---

Patch is 93.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70194.diff


10 Files Affected:

- (modified) llvm/lib/CodeGen/InlineSpiller.cpp (+41-4) 
- (modified) llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir (+7-9) 
- (added) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+389) 
- (added) llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir (+143) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+177-193) 
- (modified) llvm/test/CodeGen/AMDGPU/swdev380865.ll (+39-54) 
- (modified) llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir (+2-2) 
- (modified) llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll (+15-27) 
- (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+26-31) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll (-2) 


``````````diff
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 46fcc62e09e8a8c..3a4c9853caec1bd 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -495,6 +495,31 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   return true;
 }
 
+/// Check if all subranges in \p LI and \p SLI have the same value number at \p
+/// Idx.
+static bool allSubRangeValNoSame(const LiveInterval &LI,
+                                 const LiveInterval &SLI,
+                                 const MachineInstr &MI,
+                                 const MachineRegisterInfo &MRI,
+                                 const TargetRegisterInfo &TRI, SlotIndex Idx) {
+  for (auto &SR : SLI.subranges()) {
+    VNInfo *SubVNI = SR.getVNInfoAt(Idx);
+
+    for (auto &SubLI : LI.subranges()) {
+      if (SubLI.LaneMask == SR.LaneMask) {
+        if (SubVNI != SubLI.getVNInfoAt(Idx))
+          return false;
+      } else if ((SubLI.LaneMask & SR.LaneMask).any()) {
+        // TODO: Check non-exact, overlapping subranges if they share the same
+        // def instruction
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
 /// redundant spills of this value in SLI.reg and sibling copies.
 void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
@@ -524,11 +549,23 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
       if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
-      if (LI->getVNInfoAt(Idx) != VNI)
-        continue;
 
-      // Follow sibling copies down the dominator tree.
-      if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) {
+      bool SameVNI = LI->getVNInfoAt(Idx) == VNI;
+
+      // The main range value numbers will differ if multiple instructions are
+      // used to define its various subregisters. Check the subregister value
+      // numbers as a fallback.
+      if (!SameVNI) {
+        if (!SLI.hasSubRanges() ||
+            !allSubRangeValNoSame(*LI, SLI, MI, MRI, TRI, Idx))
+          continue;
+      }
+
+      // Follow sibling copies down the dominator tree. Don't do this if we're
+      // relying on identical subranges to avoid infinitely recursing.
+      // TODO: Handle subrange case.
+      Register DstReg;
+      if (SameVNI && (DstReg = isCopyOfBundle(MI, Reg, TII))) {
         if (isSibling(DstReg)) {
           LiveInterval &DstLI = LIS.getInterval(DstReg);
           VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index 7209d160e6c8a7a..857784282528bc7 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -47,7 +47,7 @@ body:             |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
+  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   renamable $vgpr57 = COPY $vgpr9, implicit $exec
   ; GCN-NEXT:   renamable $vgpr56 = COPY $vgpr8, implicit $exec
@@ -62,17 +62,15 @@ body:             |
   ; GCN-NEXT:   renamable $sgpr16_sgpr17 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.1, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0
-  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5)
-  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2, addrspace 5)
+  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
-  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2, addrspace 5)
-  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
new file mode 100644
index 000000000000000..e06bed1774cd056
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -0,0 +1,389 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+
+define void @main(i1 %arg) #0 {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v1, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v1, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v1, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v1, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v1, s40, 6
+; CHECK-NEXT:    v_writelane_b32 v1, s41, 7
+; CHECK-NEXT:    v_writelane_b32 v1, s42, 8
+; CHECK-NEXT:    v_writelane_b32 v1, s43, 9
+; CHECK-NEXT:    v_writelane_b32 v1, s44, 10
+; CHECK-NEXT:    v_writelane_b32 v1, s45, 11
+; CHECK-NEXT:    v_writelane_b32 v1, s46, 12
+; CHECK-NEXT:    v_writelane_b32 v1, s47, 13
+; CHECK-NEXT:    v_writelane_b32 v1, s48, 14
+; CHECK-NEXT:    v_writelane_b32 v1, s49, 15
+; CHECK-NEXT:    s_getpc_b64 s[24:25]
+; CHECK-NEXT:    v_writelane_b32 v1, s50, 16
+; CHECK-NEXT:    s_movk_i32 s4, 0xf0
+; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    v_writelane_b32 v1, s51, 17
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
+; CHECK-NEXT:    s_movk_i32 s4, 0x130
+; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v4, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v4, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v4, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v4, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v4, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v4, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
+; CHECK-NEXT:    s_mov_b32 s20, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_writelane_b32 v4, s50, 14
+; CHECK-NEXT:    v_mov_b32_e32 v5, s28
+; CHECK-NEXT:    v_mov_b32_e32 v6, v2
+; CHECK-NEXT:    s_mov_b32 s21, s20
+; CHECK-NEXT:    s_mov_b32 s22, s20
+; CHECK-NEXT:    s_mov_b32 s23, s20
+; CHECK-NEXT:    v_writelane_b32 v4, s51, 15
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v4, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v1, s52, 18
+; CHECK-NEXT:    v_writelane_b32 v4, s5, 17
+; CHECK-NEXT:    v_writelane_b32 v1, s53, 19
+; CHECK-NEXT:    v_writelane_b32 v4, s6, 18
+; CHECK-NEXT:    v_writelane_b32 v1, s54, 20
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[4:11], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v4, s7, 19
+; CHECK-NEXT:    v_writelane_b32 v1, s55, 21
+; CHECK-NEXT:    v_writelane_b32 v4, s8, 20
+; CHECK-NEXT:    v_writelane_b32 v1, s56, 22
+; CHECK-NEXT:    v_writelane_b32 v4, s9, 21
+; CHECK-NEXT:    v_writelane_b32 v1, s57, 23
+; CHECK-NEXT:    v_writelane_b32 v4, s10, 22
+; CHECK-NEXT:    v_writelane_b32 v1, s58, 24
+; CHECK-NEXT:    v_writelane_b32 v4, s11, 23
+; CHECK-NEXT:    v_writelane_b32 v1, s59, 25
+; CHECK-NEXT:    v_writelane_b32 v4, s12, 24
+; CHECK-NEXT:    v_writelane_b32 v1, s60, 26
+; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
+; CHECK-NEXT:    v_writelane_b32 v1, s61, 27
+; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
+; CHECK-NEXT:    v_writelane_b32 v1, s62, 28
+; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v1, s63, 29
+; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
+; CHECK-NEXT:    v_writelane_b32 v1, s64, 30
+; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v1, s65, 31
+; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
+; CHECK-NEXT:    v_writelane_b32 v1, s66, 32
+; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
+; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
+; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_mov_b32 s29, s24
+; CHECK-NEXT:    v_writelane_b32 v1, s67, 33
+; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
+; CHECK-NEXT:    s_mov_b32 s27, s24
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[26:27], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mul_f32_e32 v0, v6, v5
+; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
+; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_3
+; CHECK-NEXT:  ; %bb.1: ; %bb48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
+; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 32
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 33
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 34
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 35
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 36
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 37
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 38
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 39
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 40
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 41
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 42
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 43
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 44
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 45
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 46
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 47
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_and_b64 vcc, exec, -1
+; CHECK-NEXT:  .LBB0_2: ; %bb50
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_mov_b32 s21, s20
+; CHECK-NEXT:    s_mov_b32 s22, s20
+; CHECK-NEXT:    s_mov_b32 s23, s20
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[44:51], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[60:67], s[20:23] dmask:0x1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v6
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v0
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v5
+; CHECK-NEXT:    s_mov_b64 vcc, vcc
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT:  .LBB0_3: ; %Flow14
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[28:29], s[26:27]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_10
+; CHECK-NEXT:  ; %bb.4: ; %bb32
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
+; CHECK-NEXT:    s_xor_b64 vcc, exec, s[8:9]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_6
+; CHECK-NEXT:  ; %bb.5: ; %bb43
+; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s9, s8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s8
+; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, s9
+; CHECK-NEXT:    s_mov_b32 s10, s8
+; CHECK-NEXT:    s_mov_b32 s11, s8
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, v6
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
+; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_mov_b64 s[42:43], s[18:19]
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[16:17]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[14:15]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT:    v_readlane_b32 s12, v4, 16
+; CHECK-NEXT:    v_readlane_b32 s20, v4, 24
+; CHECK-NEXT:    v_readlane_b32 s21, v4, 25
+; CHECK-NEXT:    v_readlane_b32 s22, v4, 26
+; CHECK-NEXT:    v_readlane_b32 s23, v4, 27
+; CHECK-NEXT:    v_readlane_b32 s24, v4, 28
+; CHECK-NEXT:    v_readlane_b32 s25, v4, 29
+; CHECK-NEXT:    v_readlane_b32 s26, v4, 30
+; CHECK-NEXT:    v_readlane_b32 s27, v4, 31
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
+; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[20:27], s[4:7] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s13, v4, 17
+; CHECK-NEXT:    v_readlane_b32 s14, v4, 18
+; CHECK-NEXT:    v_readlane_b32 s15, v4, 19
+; CHECK-NEXT:    v_readlane_b32 s16, v4, 20
+; CHECK-NEXT:    v_readlane_b32 s17, v4, 21
+; CHECK-NEXT:    v_readlane_b32 s18, v4, 22
+; CHECK-NEXT:    v_readlane_b32 s19, v4, 23
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    buffer_store_dwordx3 v[5:7], off, s[8:11], 0
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0
+; CHECK-NEXT:  .LBB0_6: ; %Flow12
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB0_9
+; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
+; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    s_mov_b32 s6, s8
+; CHECK-NEXT:    s_mov_b32 s7, s8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    s_mov_b32 s9, s8
+; CHECK-NEXT:    s_mov_b32 s10, s8
+; CHECK-NEXT:    s_mov_b32 s11, s8
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_and_b64 vcc, exec, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_sub_f32_e32 v2, v6, v5
+; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:  .LBB0_8: ; %bb33
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_f32_e32 v3, v2, v0
+; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v3
+; CHECK-NEXT:    s_mov_b64 vcc, vcc
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
+; CHECK-NEXT:  .LBB0_9: ; %Flow13
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b64 exec, exec, s[28:29]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s67, v1, 33
+; CHECK-NEXT:    v_readlane_b32 s66, v1, 32
+; CHECK-NEXT:    v_readlane_b32 s65, v1, 31
+; CHECK-NEXT:    v_readlane_b32 s64, v1, 30
+; CHECK-NEXT:    v_readlane_b32 s63, v1, 29
+; CHEC...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/70194


More information about the llvm-commits mailing list