[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for double types - 2 (PR #189391)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Mar 30 07:13:18 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Aaditya (easyonaadit)

<details>
<summary>Changes</summary>

Supported Ops: `fadd` and `fsub`

---

Patch is 231.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189391.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+12-10) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll (+1008-130) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll (+1030-130) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dcc342638c5c1..94b27973b83f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5745,6 +5745,8 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
   case AMDGPU::V_MIN_F64_e64:
   case AMDGPU::V_MAX_NUM_F64_e64:
   case AMDGPU::V_MAX_F64_e64:
+  case AMDGPU::V_ADD_F64_pseudo_e64:
+  case AMDGPU::V_ADD_F64_e64:
     DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
     break;
   default:
@@ -6579,17 +6581,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         }
         FinalDPPResult = RowBcast31;
       }
-      if (Opc == AMDGPU::V_SUB_F32_e64) {
-        Register NegatedValVGPR =
-            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_SUB_F32_e64),
+      if (Opc == AMDGPU::V_SUB_F32_e64 ||
+          MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
+        Register NegatedValVGPR = MRI.createVirtualRegister(SrcRegClass);
+        BuildMI(*CurrBB, MI, DL, TII->get(Opc),
                 NegatedValVGPR)
-            .addImm(SISrcMods::NONE)                    // src0 mods
-            .addReg(IdentityVGPR)                       // src0
-            .addImm(SISrcMods::NONE)                    // src1 mods
-            .addReg(IsWave32 ? RowBcast15 : RowBcast31) // src1
-            .addImm(SISrcMods::NONE)                    // clamp
-            .addImm(SISrcMods::NONE);                   // omod
+            .addImm(SISrcMods::NONE)                               // src0 mods
+            .addReg(IdentityVGPR)                                  // src0
+            .addImm(is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
+            .addReg(IsWave32 ? RowBcast15 : RowBcast31)            // src1
+            .addImm(SISrcMods::NONE)                               // clamp
+            .addImm(SISrcMods::NONE);                              // omod
         FinalDPPResult = NegatedValVGPR;
       }
       // The final reduced value is in the last lane.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
index 8ece344b61234..1906687709c1e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
@@ -950,6 +950,884 @@ entry:
   ret void
 }
 
+define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) {
+; GFX8DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_double_dpp:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[5:6], v[7:8]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_double_dpp:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[5:6], v[7:8]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_s...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/189391


More information about the llvm-branch-commits mailing list