[llvm] [AMDGPU] Generate more swaps (PR #184164)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 14:58:08 PST 2026
https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/184164
From a680c3011cb30b3ce7584cd2c390ec54ac3a45e7 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 26 Feb 2026 14:56:21 -0600
Subject: [PATCH 1/3] Generate more swaps
---
.../Target/AMDGPU/SIShrinkInstructions.cpp | 146 ++++++++++-------
.../atomic_optimizations_global_pointer.ll | 44 ++---
llvm/test/CodeGen/AMDGPU/v_swap_b16.ll | 154 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/v_swap_b32.mir | 102 ++++++++----
.../CodeGen/AMDGPU/whole-wave-functions.ll | 4 +-
5 files changed, 326 insertions(+), 124 deletions(-)
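
Note for reviewers skimming the diff: the peephole fuses the last two movs
of a rotate-through-temporary into one hardware swap. A minimal scalar
sketch of the identity in plain C++ (toy variables named after the comment
in matchSwap; this is not the pass itself):

    #include <cassert>
    #include <utility>

    int main() {
      int X = 10, Y = 20;
      int T = X; // movT: T = X
      X = Y;     // movX: X = Y
      Y = T;     // movY: Y = T (the saved X)
      assert(X == 20 && Y == 10);

      // Same effect in the fused form the pass emits: movT stays,
      // movX/movY collapse into a single swap.
      int X2 = 10, Y2 = 20, T2 = X2;
      std::swap(X2, Y2); // v_swap_b32 X, Y
      assert(X2 == X && Y2 == Y && T2 == T);
      return 0;
    }

The saved copy in T is untouched by the swap, which is why matchSwap must
reject any intervening write to T.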
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 14ed778f44f3a..9745f4bb41f75 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -698,6 +698,25 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs(
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
+
+/*
+ movT T, X movT, X
+
+ ... <no writes to T>
+
+ movX X, Y
+
+ <no writes to T>
+ ... <no reads/writes to X>
+ <no writes to Y>
+
+ insertion-point =======> swap X, Y
+
+ ... <no writes to T>
+ <no reads/writes to Y>
+ movY, Y, T
+
+ */
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
@@ -711,7 +730,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
return nullptr;
Register X = Xop.getReg();
unsigned Xsub = Xop.getSubReg();
-
+ Register Y;
+ unsigned Ysub;
+
unsigned Size = TII->getOpSize(MovT, 0);
// We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
@@ -724,73 +745,78 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
const unsigned SearchLimit = 16;
unsigned Count = 0;
- bool KilledT = false;
+
+ MachineInstr *MovX = nullptr;
+ MachineInstr *InsertionPt = nullptr;
+ MachineInstr *MovY = nullptr;
+
for (auto Iter = std::next(MovT.getIterator()),
E = MovT.getParent()->instr_end();
- Iter != E && Count < SearchLimit && !KilledT; ++Iter) {
-
- MachineInstr *MovY = &*Iter;
- KilledT = MovY->killsRegister(T, TRI);
- if (MovY->isDebugInstr())
- continue;
+ Iter != E && Count < SearchLimit; ++Iter) {
+ if (Iter->isDebugInstr())
+ continue;
++Count;
- if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
- MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
- MovY->getOpcode() != AMDGPU::COPY) ||
- !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
- MovY->getOperand(1).getSubReg() != Tsub)
- continue;
-
- Register Y = MovY->getOperand(0).getReg();
- unsigned Ysub = MovY->getOperand(0).getSubReg();
-
- if (!TRI->isVGPR(*MRI, Y))
- continue;
-
- MachineInstr *MovX = nullptr;
- for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
- I != IY; ++I) {
- if (I->isDebugInstr())
- continue;
- if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
- instModifiesReg(&*I, T, Tsub) ||
- (MovX && instModifiesReg(&*I, X, Xsub))) {
- MovX = nullptr;
- break;
- }
- if (!instReadsReg(&*I, Y, Ysub)) {
- if (!MovX && instModifiesReg(&*I, X, Xsub)) {
- MovX = nullptr;
- break;
- }
- continue;
+ if (instModifiesReg(&*Iter, T, Tsub))
+ return nullptr;
+
+ if (!MovX) {
+ if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
+ Iter->getOpcode() == AMDGPU::COPY) &&
+ Iter->getOperand(0).getReg() == X &&
+ Iter->getOperand(0).getSubReg() == Xsub &&
+ Iter->getOperand(1).isReg()) {
+ MovX = &*Iter;
+ Y = MovX->getOperand(1).getReg();
+ Ysub = MovX->getOperand(1).getSubReg();
}
- if (MovX ||
- (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
- I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
- I->getOpcode() != AMDGPU::COPY) ||
- I->getOperand(0).getReg() != X ||
- I->getOperand(0).getSubReg() != Xsub) {
- MovX = nullptr;
- break;
+ } else {
+ if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
+ Iter->getOpcode() == AMDGPU::COPY) &&
+ Iter->getOperand(0).getReg() == Y &&
+ Iter->getOperand(0).getSubReg() == Ysub &&
+ Iter->getOperand(1).isReg() &&
+ Iter->getOperand(1).getReg() == T &&
+ Iter->getOperand(1).getSubReg() == Tsub) {
+ MovY=&*Iter;
+ break;
}
- if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
- continue;
-
- MovX = &*I;
+ // Effectively, mov X, Y must be moved downward
+ // and mov Y, T must be moved upward so that they can be fused into a swap.
+ // A write to Y creates a barrier that prevents the two moves from being moved
+ // adjacent to each other.
+ if (instModifiesReg(&*Iter, Y, Ysub))
+ return nullptr;
+
+ // Reads or writes to X prevent mov X, Y from being moved farther downward.
+ // Select this to be the insertion point.
+ if (!InsertionPt &&
+ (instReadsReg(&*Iter, X, Xsub) ||
+ instModifiesReg(&*Iter, X, Xsub))) {
+ InsertionPt = &*Iter;
+ }
+ // If the insertion point has been found, then mov Y, T must be moved upward
+ // past all subsequent instructions. A read of Y will block this movement.
+ if (InsertionPt) {
+ if (instReadsReg(&*Iter, Y, Ysub))
+ return nullptr;
+ }
}
-
- if (!MovX)
- continue;
-
+ }
+ if (MovY) {
LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);
MachineBasicBlock &MBB = *MovT.getParent();
SmallVector<MachineInstr *, 4> Swaps;
+
+
+ if (!InsertionPt)
+ InsertionPt=MovY;
if (Size == 2) {
- auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
+ auto *MIB = BuildMI(MBB, InsertionPt->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B16))
.addDef(X)
.addDef(Y)
@@ -804,7 +830,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
TargetInstrInfo::RegSubRegPair X1, Y1;
X1 = getSubRegForIndex(X, Xsub, I);
Y1 = getSubRegForIndex(Y, Ysub, I);
- auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
+ auto *MIB = BuildMI(MBB, InsertionPt->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B32))
.addDef(X1.Reg, {}, X1.SubReg)
.addDef(Y1.Reg, {}, Y1.SubReg)
@@ -837,12 +863,15 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
}
}
- return Next;
+ return Next;
}
-
return nullptr;
}
+
+
+
+
// If an instruction has dead sdst replace it with NULL register on gfx1030+
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
if (!ST->hasGFX10_3Insts())
@@ -905,6 +934,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
if (auto *NextMI = matchSwap(MI)) {
Next = NextMI->getIterator();
Changed = true;
+ dbgs() << "XXXXXXXXXXXXXXXXXXXX bingo\n";
continue;
}
}
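
The control flow of the rewritten matchSwap, reduced to a standalone
sketch. Inst and the integer registers below are hypothetical stand-ins
for MachineInstr and the instReadsReg/instModifiesReg queries; the sketch
also folds in the phase-one bail-out on writes to X that patch 3 below
adds:

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Toy stand-ins for MachineInstr and the register queries;
    // registers are plain ints, an Inst writes `def` and reads `uses`.
    struct Inst {
      bool isMov = false; // models V_MOV_B32_e32 / COPY
      int def = -1;
      std::vector<int> uses;
      bool writes(int R) const { return def == R; }
      bool reads(int R) const {
        for (int U : uses)
          if (U == R)
            return true;
        return false;
      }
    };

    // Scan the instructions that follow "mov T, X". Returns the index
    // before which "swap X, Y" can be inserted, or nullopt if no match.
    std::optional<size_t> matchSwap(const std::vector<Inst> &Body, int T,
                                    int X) {
      int Y = -1;
      bool FoundMovX = false;
      std::optional<size_t> InsertionPt;
      for (size_t I = 0; I < Body.size(); ++I) {
        const Inst &MI = Body[I];
        if (MI.writes(T))
          return std::nullopt; // T must survive until "mov Y, T".
        if (!FoundMovX) {
          if (MI.isMov && MI.def == X) { // Found "mov X, Y".
            FoundMovX = true;
            Y = MI.uses[0];
          } else if (MI.writes(X)) {
            return std::nullopt; // X must still hold the value saved in T.
          }
          continue;
        }
        if (MI.isMov && MI.def == Y && MI.reads(T))
          return InsertionPt.value_or(I); // Found "mov Y, T"; match done.
        if (MI.writes(Y))
          return std::nullopt; // The two movs cannot be made adjacent.
        if (!InsertionPt && (MI.reads(X) || MI.writes(X)))
          InsertionPt = I; // "mov X, Y" cannot sink below this point.
        if (InsertionPt && MI.reads(Y))
          return std::nullopt; // "mov Y, T" cannot hoist above this point.
      }
      return std::nullopt;
    }

    int main() {
      const int X = 0, Y = 1, T = 2, Tmp = 3;
      // Shape of the new swap_allow_use_def_and_flexible_insertion_point
      // test: uses of T and X between the movs no longer kill the match.
      std::vector<Inst> Body = {
          {true, X, {Y}},    // mov X, Y
          {false, Tmp, {T}}, // read of T: fine before the insertion point
          {false, Tmp, {X}}, // read of X: pins the insertion point here
          {true, Y, {T}},    // mov Y, T
      };
      if (auto At = matchSwap(Body, T, X))
        std::printf("swap X, Y inserted before index %zu\n", *At); // 2
      return 0;
    }

The key difference from the old search is the separate InsertionPt: the
swap no longer has to be emitted at the position of mov X, Y, which is
what lets intervening reads of T and X coexist with the match.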
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index fbc8b812d96c9..01de2f36ec74d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -5081,8 +5081,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_subrev_u32_e32 v4, s12, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v5
+; GFX9_DPP-NEXT: v_swap_b32 v3, v4
; GFX9_DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9_DPP-NEXT: buffer_wbinvl1_vol
@@ -5153,8 +5152,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1064_DPP-NEXT: v_subrev_nc_u32_e32 v5, s12, v6
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v6
+; GFX1064_DPP-NEXT: v_swap_b32 v4, v5
; GFX1064_DPP-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc
; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl1_inv
@@ -5216,8 +5214,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1032_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v6
+; GFX1032_DPP-NEXT: v_swap_b32 v4, v5
; GFX1032_DPP-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc
; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl1_inv
@@ -5300,8 +5297,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1164_DPP-NEXT: v_subrev_nc_u32_e32 v5, s12, v6
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v5
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v6
+; GFX1164_DPP-NEXT: v_swap_b32 v4, v5
; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc
; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl1_inv
@@ -5374,8 +5370,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1132_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v5
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v6
+; GFX1132_DPP-NEXT: v_swap_b32 v4, v5
; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc
; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl1_inv
@@ -7408,10 +7403,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX9_DPP-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v10
; GFX9_DPP-NEXT: v_subb_co_u32_e32 v9, vcc, v11, v0, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v8
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v10
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, v11
+; GFX9_DPP-NEXT: v_swap_b32 v6, v8
+; GFX9_DPP-NEXT: v_swap_b32 v7, v9
; GFX9_DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9_DPP-NEXT: buffer_wbinvl1_vol
@@ -7530,10 +7523,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX1064_DPP-NEXT: v_sub_co_u32 v10, vcc, v12, s8
; GFX1064_DPP-NEXT: v_subrev_co_ci_u32_e32 v11, vcc, s9, v13, vcc
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v12
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v13
+; GFX1064_DPP-NEXT: v_swap_b32 v8, v10
+; GFX1064_DPP-NEXT: v_swap_b32 v9, v11
; GFX1064_DPP-NEXT: buffer_atomic_cmpswap_x2 v[8:11], off, s[4:7], 0 glc
; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl1_inv
@@ -7633,10 +7624,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032_DPP-NEXT: v_sub_co_u32 v11, vcc_lo, v13, s8
; GFX1032_DPP-NEXT: v_subrev_co_ci_u32_e32 v12, vcc_lo, s9, v14, vcc_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v11
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v13
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v14
+; GFX1032_DPP-NEXT: v_swap_b32 v9, v11
+; GFX1032_DPP-NEXT: v_swap_b32 v10, v12
; GFX1032_DPP-NEXT: buffer_atomic_cmpswap_x2 v[9:12], off, s[4:7], 0 glc
; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl1_inv
@@ -7764,10 +7753,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v11
+; GFX1164_DPP-NEXT: v_swap_b32 v6, v8
+; GFX1164_DPP-NEXT: v_swap_b32 v7, v9
; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b64 v[6:9], off, s[4:7], 0 glc
; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl1_inv
@@ -7873,10 +7860,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_sub_co_u32 v10, vcc_lo, v12, s8
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_subrev_co_ci_u32_e64 v11, null, s9, v13, vcc_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v10
+; GFX1132_DPP-NEXT: v_swap_b32 v8, v10
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v13
+; GFX1132_DPP-NEXT: v_swap_b32 v9, v11
; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b64 v[8:11], off, s[4:7], 0 glc
; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl1_inv
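
In the _x2 cmpswap loops above, the four v_mov_b32 that rotated a 64-bit
value now become two v_swap_b32, one per 32-bit half. A scalar sketch of
why the per-half split is sound (plain C++, not generated code):

    #include <cassert>
    #include <cstdint>
    #include <utility>

    int main() {
      // X = 0x2222222211111111 and Y = 0x4444444433333333 as lo/hi halves.
      uint32_t XLo = 0x11111111u, XHi = 0x22222222u;
      uint32_t YLo = 0x33333333u, YHi = 0x44444444u;
      std::swap(XLo, YLo); // V_SWAP_B32 X.sub0, Y.sub0
      std::swap(XHi, YHi); // V_SWAP_B32 X.sub1, Y.sub1
      uint64_t X = (uint64_t(XHi) << 32) | XLo;
      uint64_t Y = (uint64_t(YHi) << 32) | YLo;
      assert(X == 0x4444444433333333ull); // X now holds the old Y
      assert(Y == 0x2222222211111111ull); // Y now holds the old X
      return 0;
    }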
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
index 55986328491ec..63e81ec9c5f11 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
@@ -110,3 +110,157 @@ loop:
ret:
ret half %x
}
+
+; An additional use of a swap operand (here %y) does not block swap generation.
+define half @swap_B(half %a, half %b, half %c, i32 %i) {
+; GFX11-TRUE16-LABEL: swap_B:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB1_1: ; %loop
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v0.l
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: v_swap_b16 v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v1.l
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v2.l
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: swap_B:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB1_1: ; %loop
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v3, -1, v3
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use v4
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use v2
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: swap_B:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB1_1: ; %loop
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX12-TRUE16-NEXT: ;;#ASMSTART
+; GFX12-TRUE16-NEXT: ; use v0.l
+; GFX12-TRUE16-NEXT: ;;#ASMEND
+; GFX12-TRUE16-NEXT: v_swap_b16 v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: ;;#ASMSTART
+; GFX12-TRUE16-NEXT: ; use v1.l
+; GFX12-TRUE16-NEXT: ;;#ASMEND
+; GFX12-TRUE16-NEXT: ;;#ASMSTART
+; GFX12-TRUE16-NEXT: ; use v2.l
+; GFX12-TRUE16-NEXT: ;;#ASMEND
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: swap_B:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB1_1: ; %loop
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v3, -1, v3
+; GFX12-FAKE16-NEXT: ;;#ASMSTART
+; GFX12-FAKE16-NEXT: ; use v0
+; GFX12-FAKE16-NEXT: ;;#ASMEND
+; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0
+; GFX12-FAKE16-NEXT: ;;#ASMSTART
+; GFX12-FAKE16-NEXT: ; use v4
+; GFX12-FAKE16-NEXT: ;;#ASMEND
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: ;;#ASMSTART
+; GFX12-FAKE16-NEXT: ; use v2
+; GFX12-FAKE16-NEXT: ;;#ASMEND
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %t = phi half [%c, %entry], [%y, %loop]
+ %x = phi half [%a, %entry], [%y, %loop]
+ %y = phi half [%b, %entry], [%x, %loop]
+
+ %i2 = phi i32 [%i, %entry], [%i3, %loop]
+
+ call void asm sideeffect "; use $0", "v"(half %x)
+ call void asm sideeffect "; use $0", "v"(half %y)
+ call void asm sideeffect "; use $0", "v"(half %t)
+
+ %i3 = sub i32 %i2, 1
+
+ %cmp = icmp eq i32 %i3, 0
+
+ br i1 %cmp, label %ret, label %loop
+
+ret:
+ ret half %x
+}
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
index 27229cd518028..6d00a38832145 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -27,8 +27,8 @@ body: |
# GCN-NEXT: {{^[ ]*$}}
# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
# GCN-NEXT: S_SETPC_B64_return
---
name: swap_phys_sparse
@@ -106,8 +106,7 @@ body: |
# GCN-NEXT: {{^[ ]*$}}
# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $mode, implicit $exec
-# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
-# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
---
name: swap_phys_overlap_x
tracksRegLiveness: true
@@ -221,10 +220,8 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %2:vgpr_32 = COPY %0
# GCN-NEXT: %3:vgpr_32 = COPY %0
-# GCN-NEXT: %0:vgpr_32 = COPY %1
-# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
# GCN-NEXT: S_ENDPGM 0
---
@@ -306,10 +303,8 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %2:vgpr_32 = COPY %0
-# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %1:vgpr_32 = COPY %2
# GCN-NEXT: S_ENDPGM 0
---
@@ -334,10 +329,8 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %2:vgpr_32 = COPY %0
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %0:vgpr_32 = COPY %1
-# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
# GCN-NEXT: S_ENDPGM 0
---
@@ -390,10 +383,8 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %3:vreg_64 = COPY %0
-# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
-# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
---
name: swap_virt_copy_subreg_overlap_x_full
tracksRegLiveness: true
@@ -416,10 +407,8 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %3:vreg_64 = COPY %0.sub0_sub1
-# GCN-NEXT: %0.sub0:vreg_128 = COPY %1.sub0
-# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
---
name: swap_virt_copy_subreg_overlap_x_part
tracksRegLiveness: true
@@ -582,6 +571,7 @@ body: |
# GCN-LABEL: name: swap_virt_copy_subreg_impdef_super
# GCN: %2:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
+# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
---
name: swap_virt_copy_subreg_impdef_super
@@ -605,11 +595,9 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
-# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0, implicit %0
# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
-# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
# GCN-NEXT: S_ENDPGM 0
---
name: swap_virt_copy_subreg_impuse_x
@@ -787,11 +775,10 @@ body: |
...
# GCN-LABEL: name: swap_killed_t_early
-# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit killed $vgpr2
-# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
-# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
---
name: swap_killed_t_early
@@ -808,11 +795,10 @@ body: |
...
# GCN-LABEL: name: swap_killed_t_late
-# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
-# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit killed $vgpr2
-# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
---
name: swap_killed_t_late
@@ -829,11 +815,10 @@ body: |
...
# GCN-LABEL: name: swap_killed_x
-# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
-# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit killed $vgpr0
-# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
---
name: swap_killed_x
@@ -918,13 +903,12 @@ body: |
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
...
-# GCN-LABEL: name: implict_ops_mov_x_swap_b64
-# GCN: %2:vreg_64 = COPY %0
-# GCN-NEXT: %0:vreg_64 = COPY %1, implicit $vgpr0
-# GCN-NEXT: %1:vreg_64 = COPY %2
+# GCN-LABEL: name: implicit_ops_mov_x_swap_b64
+# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_64, %1.sub1:vreg_64 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
---
-name: implict_ops_mov_x_swap_b64
+name: implicit_ops_mov_x_swap_b64
tracksRegLiveness: true
body: |
bb.0:
@@ -967,3 +951,51 @@ body: |
$vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $vgpr2, implicit-def $vgpr0_vgpr1, implicit killed $vgpr3
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
...
+
+################################################################
+# In:
+#
+# MOV T, X
+# MOV X, Y
+# MOV Y, T
+#
+# Ensure that intervening uses/defs of T, X, and Y that preserve swap
+# semantics do not block swap generation.
+################################################################
+
+# GCN-LABEL: name: swap_allow_use_def_and_flexible_insertion_point
+# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_LSHLREV_B32_e32 $vgpr2, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e32 $vgpr2, $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e32 1, $vgpr2, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e32 1, $vgpr0, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+
+---
+name: swap_allow_use_def_and_flexible_insertion_point
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+ ; MOV T, X
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+ ; Writes to X, Y and reads of T, X, Y do not block swap generation
+ $vgpr0 = V_LSHLREV_B32_e64 $vgpr2, $vgpr0, implicit $exec
+ $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr1, implicit $exec
+
+ ; MOV X, Y
+ $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
+
+ ; Read of T before insertion point does not block swap generation
+ $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr2, implicit $exec
+ ; SWAP X, Y will be inserted here
+ ; Read of X after insertion point does not block swap generation
+ $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr0, implicit $exec
+
+ ; MOV Y, T
+ $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec
+
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 37105efa3333c..3783891ab7f48 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -5144,9 +5144,9 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GISEL-NEXT: scratch_store_b32 off, v247, s32 offset:572
; GISEL-NEXT: s_mov_b32 exec_lo, -1
; GISEL-NEXT: v_mov_b32_e32 v2, v0
-; GISEL-NEXT: v_swap_b32 v0, v1
; GISEL-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi
+; GISEL-NEXT: v_swap_b32 v0, v1
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload
@@ -5776,9 +5776,9 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GISEL64-NEXT: scratch_store_b32 off, v247, s32 offset:572
; GISEL64-NEXT: s_mov_b64 exec, -1
; GISEL64-NEXT: v_mov_b32_e32 v2, v0
-; GISEL64-NEXT: v_swap_b32 v0, v1
; GISEL64-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo
; GISEL64-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi
+; GISEL64-NEXT: v_swap_b32 v0, v1
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload
From 219de1069c005a08bf64daea72445577bc8a27b8 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 2 Mar 2026 10:27:45 -0600
Subject: [PATCH 2/3] Format code
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../Target/AMDGPU/SIShrinkInstructions.cpp | 105 +++++++-----------
1 file changed, 41 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 9745f4bb41f75..2f0597a466dc1 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -698,25 +698,6 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs(
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
-
-/*
-   movT T, X                 movT T, X
-
-   ... <no writes to T>
-
-   movX X, Y
-
-       <no writes to T>
-   ... <no reads/writes to X>
-       <no writes to Y>
-
-   insertion-point =======>  swap X, Y
-
-   ... <no writes to T>
-       <no reads/writes to Y>
-   movY Y, T
-
- */
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
@@ -732,7 +713,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
unsigned Xsub = Xop.getSubReg();
Register Y;
unsigned Ysub;
-
+
unsigned Size = TII->getOpSize(MovT, 0);
// We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
@@ -745,64 +726,66 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
const unsigned SearchLimit = 16;
unsigned Count = 0;
-
+
MachineInstr *MovX = nullptr;
- MachineInstr *InsertionPt = nullptr;
+ MachineInstr *InsertionPt = nullptr;
MachineInstr *MovY = nullptr;
-
+
for (auto Iter = std::next(MovT.getIterator()),
E = MovT.getParent()->instr_end();
Iter != E && Count < SearchLimit; ++Iter) {
if (Iter->isDebugInstr())
- continue;
+ continue;
++Count;
if (instModifiesReg(&*Iter, T, Tsub))
return nullptr;
-
+
if (!MovX) {
+ // Search for mov x, y.
if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
- Iter->getOpcode() == AMDGPU::COPY) &&
- Iter->getOperand(0).getReg() == X &&
- Iter->getOperand(0).getSubReg() == Xsub &&
- Iter->getOperand(1).isReg()) {
- MovX = &*Iter;
- Y = MovX->getOperand(1).getReg();
- Ysub = MovX->getOperand(1).getSubReg();
+ Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
+ Iter->getOpcode() == AMDGPU::COPY) &&
+ Iter->getOperand(0).getReg() == X &&
+ Iter->getOperand(0).getSubReg() == Xsub &&
+ Iter->getOperand(1).isReg()) {
+ MovX = &*Iter;
+ Y = MovX->getOperand(1).getReg();
+ Ysub = MovX->getOperand(1).getSubReg();
}
} else {
+ // mov x, y has been found.
+ // Search for mov y, t.
if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
- Iter->getOpcode() == AMDGPU::COPY) &&
- Iter->getOperand(0).getReg() == Y &&
- Iter->getOperand(0).getSubReg() == Ysub &&
- Iter->getOperand(1).isReg() &&
- Iter->getOperand(1).getReg() == T &&
- Iter->getOperand(1).getSubReg() == Tsub) {
- MovY=&*Iter;
- break;
+ Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
+ Iter->getOpcode() == AMDGPU::COPY) &&
+ Iter->getOperand(0).getReg() == Y &&
+ Iter->getOperand(0).getSubReg() == Ysub &&
+ Iter->getOperand(1).isReg() && Iter->getOperand(1).getReg() == T &&
+ Iter->getOperand(1).getSubReg() == Tsub) {
+ MovY = &*Iter;
+ break;
}
- // Effectively, mov X, Y must be moved downward
- // and mov Y, T must be moved upward so that they can be fused into a swap.
- // A write to Y creates a barrier that prevents the two moves from being moved
- // adjacent to each other.
+ // Effectively, mov x, y must be moved downward
+ // and mov y, t must be moved upward so that they can be fused into a
+ // swap. A write to y creates a barrier that prevents the two moves from
+ // being moved adjacent to each other.
if (instModifiesReg(&*Iter, Y, Ysub))
- return nullptr;
+ return nullptr;
- // Reads or writes to X prevent mov X, Y from being moved farther downward.
- // Select this to be the insertion point.
+ // Reads or writes to x prevent mov x, y from being moved farther
+ // downward. Select this to be the insertion point.
if (!InsertionPt &&
- (instReadsReg(&*Iter, X, Xsub) ||
- instModifiesReg(&*Iter, X, Xsub))) {
- InsertionPt = &*Iter;
+ (instReadsReg(&*Iter, X, Xsub) || instModifiesReg(&*Iter, X, Xsub))) {
+ InsertionPt = &*Iter;
}
- // If the insertion point has been found, then mov Y, T must be moved upward
- // past all subsequent instructions. A read of Y will block this movement.
+ // If the insertion point has been found, then mov y, t must be moved
+ // upward past all subsequent instructions. A read of y will block this
+ // movement.
if (InsertionPt) {
- if (instReadsReg(&*Iter, Y, Ysub))
- return nullptr;
+ if (instReadsReg(&*Iter, Y, Ysub))
+ return nullptr;
}
}
}
@@ -812,9 +795,8 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
MachineBasicBlock &MBB = *MovT.getParent();
SmallVector<MachineInstr *, 4> Swaps;
-
if (!InsertionPt)
- InsertionPt=MovY;
+ InsertionPt = MovY;
if (Size == 2) {
auto *MIB = BuildMI(MBB, InsertionPt->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B16))
@@ -863,15 +845,11 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
}
}
- return Next;
+ return Next;
}
return nullptr;
}
-
-
-
-
// If an instruction has dead sdst replace it with NULL register on gfx1030+
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
if (!ST->hasGFX10_3Insts())
@@ -934,7 +912,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
if (auto *NextMI = matchSwap(MI)) {
Next = NextMI->getIterator();
Changed = true;
- dbgs() << "XXXXXXXXXXXXXXXXXXXX bingo\n";
continue;
}
}
From ec686f368ca0a9a4162568e577fc6d9a81303342 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 2 Mar 2026 16:07:44 -0600
Subject: [PATCH 3/3] Fix register analysis
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 3 +++
llvm/test/CodeGen/AMDGPU/v_swap_b32.mir | 10 ++++++----
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 2f0597a466dc1..e59c310eab898 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -752,6 +752,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
MovX = &*Iter;
Y = MovX->getOperand(1).getReg();
Ysub = MovX->getOperand(1).getSubReg();
+ } else if (instModifiesReg(&*Iter, X, Xsub)) {
+ // Writes to x are not allowed until mov x, y has been found.
+ return nullptr;
}
} else {
// mov x, y has been found.
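
The new bail-out is easiest to see with scalar values: if X is clobbered
between mov T, X and mov X, Y, a swap placed at the later point reads the
clobbered X, so Y would not receive the value saved in T. A minimal sketch
in plain C++ (not the pass):

    #include <cassert>
    #include <utility>

    int main() {
      // Original sequence: Y must end up with the X value saved in T.
      int X = 1, Y = 2;
      int T = X; // mov T, X   (T = 1)
      X = 99;    // intervening write to X
      X = Y;     // mov X, Y   (X = 2)
      Y = T;     // mov Y, T   (Y = 1, the saved value)
      assert(X == 2 && Y == 1);

      // Fusing the last two movs into a swap reads the clobbered X:
      int X2 = 1, Y2 = 2, T2 = X2;
      X2 = 99;
      std::swap(X2, Y2); // X2 = 2, but Y2 = 99, not the saved 1
      assert(X2 == X && Y2 != T2); // the fusion would miscompile here
      return 0;
    }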
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
index 6d00a38832145..43192d04a0e31 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -329,8 +329,10 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
# GCN-NEXT: S_ENDPGM 0
---
@@ -965,7 +967,7 @@ body: |
# GCN-LABEL: name: swap_allow_use_def_and_flexible_insertion_point
# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
-# GCN-NEXT: $vgpr0 = V_LSHLREV_B32_e32 $vgpr2, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e32 $vgpr2, $vgpr0, implicit $exec
# GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e32 $vgpr2, $vgpr1, implicit $exec
# GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e32 1, $vgpr2, implicit $exec
# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
@@ -981,8 +983,8 @@ body: |
; MOV T, X
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
- ; Writes to X, Y and reads of T, X, Y do not block swap generation
- $vgpr0 = V_LSHLREV_B32_e64 $vgpr2, $vgpr0, implicit $exec
+ ; Writes to Y and reads of T, X, Y do not block swap generation
+ $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr0, implicit $exec
$vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr1, implicit $exec
; MOV X, Y