[llvm-branch-commits] [llvm] 0c15b99 - Revert "[AMDGPU] Generate more swaps (#184164)"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Mar 20 08:46:48 PDT 2026
Author: LU-JOHN
Date: 2026-03-20T10:46:44-05:00
New Revision: 0c15b99e3f649275d85ee7358b01a04a0ee49e8d
URL: https://github.com/llvm/llvm-project/commit/0c15b99e3f649275d85ee7358b01a04a0ee49e8d
DIFF: https://github.com/llvm/llvm-project/commit/0c15b99e3f649275d85ee7358b01a04a0ee49e8d.diff
LOG: Revert "[AMDGPU] Generate more swaps (#184164)"
This reverts commit 81396ebc51c40214465111ede745147989c67e48.
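For context: SIShrinkInstructions::matchSwap looks for a register swap spelled as three moves through a temporary and folds the tail of it into a V_SWAP. A minimal sketch in MIR, with illustrative register numbers borrowed from the tests below (t = $vgpr2, x = $vgpr0, y = $vgpr1; the annotations are ours):

    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec   ; mov t, x
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec   ; mov x, y
    $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec   ; mov y, t

The last two moves are fused into:

    $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec

The reverted change (#184164) allowed this match across more intervening instructions by tracking a separate insertion point for the swap; the revert restores the stricter original search, so several v_swap_b32/v_swap_b16 sequences in the tests below go back to plain moves.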
Added:
Modified:
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index e59c310eab898..14ed778f44f3a 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -711,8 +711,6 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
return nullptr;
Register X = Xop.getReg();
unsigned Xsub = Xop.getSubReg();
- Register Y;
- unsigned Ysub;
unsigned Size = TII->getOpSize(MovT, 0);
@@ -726,82 +724,73 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
const unsigned SearchLimit = 16;
unsigned Count = 0;
-
- MachineInstr *MovX = nullptr;
- MachineInstr *InsertionPt = nullptr;
- MachineInstr *MovY = nullptr;
-
+ bool KilledT = false;
for (auto Iter = std::next(MovT.getIterator()),
E = MovT.getParent()->instr_end();
- Iter != E && Count < SearchLimit; ++Iter) {
- if (Iter->isDebugInstr())
+ Iter != E && Count < SearchLimit && !KilledT; ++Iter) {
+
+ MachineInstr *MovY = &*Iter;
+ KilledT = MovY->killsRegister(T, TRI);
+ if (MovY->isDebugInstr())
continue;
++Count;
- if (instModifiesReg(&*Iter, T, Tsub))
- return nullptr;
-
- if (!MovX) {
- // Search for mov x, y.
- if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
- Iter->getOpcode() == AMDGPU::COPY) &&
- Iter->getOperand(0).getReg() == X &&
- Iter->getOperand(0).getSubReg() == Xsub &&
- Iter->getOperand(1).isReg()) {
- MovX = &*Iter;
- Y = MovX->getOperand(1).getReg();
- Ysub = MovX->getOperand(1).getSubReg();
- } else if (instModifiesReg(&*Iter, X, Xsub)) {
- // Writes to x are not allowed until mov x, y has been found
- return nullptr;
- }
- } else {
- // mov x, y has been found.
- // Search for mov y, t.
- if ((Iter->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- Iter->getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
- Iter->getOpcode() == AMDGPU::COPY) &&
- Iter->getOperand(0).getReg() == Y &&
- Iter->getOperand(0).getSubReg() == Ysub &&
- Iter->getOperand(1).isReg() && Iter->getOperand(1).getReg() == T &&
- Iter->getOperand(1).getSubReg() == Tsub) {
- MovY = &*Iter;
+ if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
+ MovY->getOpcode() != AMDGPU::COPY) ||
+ !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
+ MovY->getOperand(1).getSubReg() != Tsub)
+ continue;
+
+ Register Y = MovY->getOperand(0).getReg();
+ unsigned Ysub = MovY->getOperand(0).getSubReg();
+
+ if (!TRI->isVGPR(*MRI, Y))
+ continue;
+
+ MachineInstr *MovX = nullptr;
+ for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
+ I != IY; ++I) {
+ if (I->isDebugInstr())
+ continue;
+ if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
+ instModifiesReg(&*I, T, Tsub) ||
+ (MovX && instModifiesReg(&*I, X, Xsub))) {
+ MovX = nullptr;
break;
}
-
- // Effectively, mov x, y must be moved downward
- // and mov y, t must be moved upward so that they can be fused into a
- // swap. A write to y creates a barrier that prevents the two moves from
- // being moved adjacent to each other.
- if (instModifiesReg(&*Iter, Y, Ysub))
- return nullptr;
-
- // Reads or writes to x prevent mov x, y from being moved farther
- // downward. Select this to be the insertion point.
- if (!InsertionPt &&
- (instReadsReg(&*Iter, X, Xsub) || instModifiesReg(&*Iter, X, Xsub))) {
- InsertionPt = &*Iter;
+ if (!instReadsReg(&*I, Y, Ysub)) {
+ if (!MovX && instModifiesReg(&*I, X, Xsub)) {
+ MovX = nullptr;
+ break;
+ }
+ continue;
}
- // If the insertion point has been found, then mov y, t must be moved
- // upward past all subsequent instructions. A read of y will block this
- // movement.
- if (InsertionPt) {
- if (instReadsReg(&*Iter, Y, Ysub))
- return nullptr;
+ if (MovX ||
+ (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
+ I->getOpcode() != AMDGPU::COPY) ||
+ I->getOperand(0).getReg() != X ||
+ I->getOperand(0).getSubReg() != Xsub) {
+ MovX = nullptr;
+ break;
}
+
+ if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
+ continue;
+
+ MovX = &*I;
}
- }
- if (MovY) {
+
+ if (!MovX)
+ continue;
+
LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);
MachineBasicBlock &MBB = *MovT.getParent();
SmallVector<MachineInstr *, 4> Swaps;
-
- if (!InsertionPt)
- InsertionPt = MovY;
if (Size == 2) {
- auto *MIB = BuildMI(MBB, InsertionPt->getIterator(), MovT.getDebugLoc(),
+ auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B16))
.addDef(X)
.addDef(Y)
@@ -815,7 +804,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
TargetInstrInfo::RegSubRegPair X1, Y1;
X1 = getSubRegForIndex(X, Xsub, I);
Y1 = getSubRegForIndex(Y, Ysub, I);
- auto *MIB = BuildMI(MBB, InsertionPt->getIterator(), MovT.getDebugLoc(),
+ auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
TII->get(AMDGPU::V_SWAP_B32))
.addDef(X1.Reg, {}, X1.SubReg)
.addDef(Y1.Reg, {}, Y1.SubReg)
@@ -850,6 +839,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
return Next;
}
+
return nullptr;
}
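For readers decoding the hunk above: the removed '-' lines (from #184164) scanned forward from MovT for "mov x, y" first and then "mov y, t", tracking a flexible insertion point so the fused swap could be placed where both moves were legal. The restored '+' lines search the other way around: find a candidate "mov y, t" (MovY) first, then verify that exactly one "mov x, y" sits between MovT and MovY with no interfering reads or writes. A toy, self-contained C++ sketch of that restored shape follows; Instr, findSwap, and the one-def/one-read register model are invented for illustration and share nothing with the real LLVM data structures (subregisters, debug instructions, and implicit operands are all elided):

    #include <cstdio>
    #include <optional>
    #include <utility>
    #include <vector>

    // Toy IR: each instruction defines at most one register (dst) and
    // reads at most one (src); -1 means "none".
    struct Instr {
      int dst;
      int src;
      bool isMove; // plain register-to-register copy
    };

    // Starting from MovT ("mov t, x") at index t, find a later "mov y, t"
    // (MovY), then require exactly one "mov x, y" between them with no
    // interfering defs of y/t, reads of x, or late defs of x.
    std::optional<std::pair<int, int>>
    findSwap(const std::vector<Instr> &MBB, int t) {
      const int T = MBB[t].dst, X = MBB[t].src;
      for (size_t y = t + 1; y < MBB.size(); ++y) {
        if (!MBB[y].isMove || MBB[y].src != T)
          continue; // not "mov y, t"; keep scanning, like the outer loop
        const int Y = MBB[y].dst;
        int movX = -1; // index of the unique "mov x, y", if found
        bool blocked = false;
        for (size_t i = t + 1; i < y; ++i) {
          const Instr &I = MBB[i];
          // A read of x, a def of y or t, or a def of x after MovX all
          // kill the match, mirroring the instReadsReg/instModifiesReg
          // checks in the restored loop.
          if (I.src == X || I.dst == Y || I.dst == T ||
              (movX >= 0 && I.dst == X)) {
            blocked = true;
            break;
          }
          if (I.src != Y) { // does not read y: only a def of x can hurt
            if (movX < 0 && I.dst == X) { blocked = true; break; }
            continue;
          }
          // Any read of y must be the single "mov x, y" itself.
          if (movX >= 0 || !I.isMove || I.dst != X) { blocked = true; break; }
          movX = (int)i;
        }
        if (!blocked && movX >= 0)
          return std::make_pair(movX, (int)y); // fuse these two into a swap
      }
      return std::nullopt;
    }

    int main() {
      // mov t2, x0 ; unrelated r4 = f(r3) ; mov x0, y1 ; mov y1, t2
      std::vector<Instr> MBB = {{2, 0, true}, {4, 3, false},
                                {0, 1, true}, {1, 2, true}};
      if (auto M = findSwap(MBB, 0))
        std::printf("fuse #%d and #%d into V_SWAP\n", M->first, M->second);
    }

The practical difference shows up in the test churn below: the restored search gives up whenever x is read or y is written between the two moves, which is part of the freedom the flexible insertion point had bought.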
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 01de2f36ec74d..fbc8b812d96c9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -5081,7 +5081,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_subrev_u32_e32 v4, s12, v5
-; GFX9_DPP-NEXT: v_swap_b32 v3, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v4
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v5
; GFX9_DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9_DPP-NEXT: buffer_wbinvl1_vol
@@ -5152,7 +5153,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1064_DPP-NEXT: v_subrev_nc_u32_e32 v5, s12, v6
-; GFX1064_DPP-NEXT: v_swap_b32 v4, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v6
; GFX1064_DPP-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc
; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl1_inv
@@ -5214,7 +5216,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1032_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6
-; GFX1032_DPP-NEXT: v_swap_b32 v4, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v6
; GFX1032_DPP-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc
; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl1_inv
@@ -5297,7 +5300,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1164_DPP-NEXT: v_subrev_nc_u32_e32 v5, s12, v6
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_swap_b32 v4, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v6
; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc
; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl1_inv
@@ -5370,7 +5374,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX1132_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_swap_b32 v4, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v6
; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc
; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl1_inv
@@ -7403,8 +7408,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX9_DPP-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v10
; GFX9_DPP-NEXT: v_subb_co_u32_e32 v9, vcc, v11, v0, vcc
-; GFX9_DPP-NEXT: v_swap_b32 v6, v8
-; GFX9_DPP-NEXT: v_swap_b32 v7, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v8
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v9
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v10
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, v11
; GFX9_DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9_DPP-NEXT: buffer_wbinvl1_vol
@@ -7523,8 +7530,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX1064_DPP-NEXT: v_sub_co_u32 v10, vcc, v12, s8
; GFX1064_DPP-NEXT: v_subrev_co_ci_u32_e32 v11, vcc, s9, v13, vcc
-; GFX1064_DPP-NEXT: v_swap_b32 v8, v10
-; GFX1064_DPP-NEXT: v_swap_b32 v9, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v10
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v12
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v13
; GFX1064_DPP-NEXT: buffer_atomic_cmpswap_x2 v[8:11], off, s[4:7], 0 glc
; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl1_inv
@@ -7624,8 +7633,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032_DPP-NEXT: v_sub_co_u32 v11, vcc_lo, v13, s8
; GFX1032_DPP-NEXT: v_subrev_co_ci_u32_e32 v12, vcc_lo, s9, v14, vcc_lo
-; GFX1032_DPP-NEXT: v_swap_b32 v9, v11
-; GFX1032_DPP-NEXT: v_swap_b32 v10, v12
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v11
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v13
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v14
; GFX1032_DPP-NEXT: buffer_atomic_cmpswap_x2 v[9:12], off, s[4:7], 0 glc
; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl1_inv
@@ -7753,8 +7764,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_swap_b32 v6, v8
-; GFX1164_DPP-NEXT: v_swap_b32 v7, v9
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v10
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v11
; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b64 v[6:9], off, s[4:7], 0 glc
; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl1_inv
@@ -7860,9 +7873,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_sub_co_u32 v10, vcc_lo, v12, s8
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_subrev_co_ci_u32_e64 v11, null, s9, v13, vcc_lo
-; GFX1132_DPP-NEXT: v_swap_b32 v8, v10
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v10
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_swap_b32 v9, v11
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v13
; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b64 v[8:11], off, s[4:7], 0 glc
; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl1_inv
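To see what the codegen changes amount to, map the first GFX9 hunk above onto matchSwap's t/x/y naming (the annotations are ours, not part of the test). With #184164 the pass matched the triple even though y is defined between the moves:

    v_mov_b32_e32 v5, v3           ; mov t, x     (t = v5, x = v3)
    v_subrev_u32_e32 v4, s12, v5   ; defines y (v4) before "mov x, y" --
                                   ; the reverted change tolerated this
    v_swap_b32 v3, v4              ; fused form of: mov x, y ; mov y, t

The restored search treats that intervening def of y as a barrier, so after the revert the tail is emitted as the two v_mov_b32 instructions again: one extra VALU instruction per retry of these cmpswap loops.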
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
index 63e81ec9c5f11..55986328491ec 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
@@ -110,157 +110,3 @@ loop:
ret:
ret half %x
}
-
-; Another use of swap operands (i.e. %y) does not block swap generation.
-define half @swap_B(half %a, half %b, half %c, i32 %i) {
-; GFX11-TRUE16-LABEL: swap_B:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: .LBB1_1: ; %loop
-; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v0.l
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: v_swap_b16 v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v1.l
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v2.l
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: swap_B:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: .LBB1_1: ; %loop
-; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v3, -1, v3
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v0
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v4
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v2
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret
-; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: swap_B:
-; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: .LBB1_1: ; %loop
-; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX12-TRUE16-NEXT: ;;#ASMSTART
-; GFX12-TRUE16-NEXT: ; use v0.l
-; GFX12-TRUE16-NEXT: ;;#ASMEND
-; GFX12-TRUE16-NEXT: v_swap_b16 v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT: ;;#ASMSTART
-; GFX12-TRUE16-NEXT: ; use v1.l
-; GFX12-TRUE16-NEXT: ;;#ASMEND
-; GFX12-TRUE16-NEXT: ;;#ASMSTART
-; GFX12-TRUE16-NEXT: ; use v2.l
-; GFX12-TRUE16-NEXT: ;;#ASMEND
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret
-; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: swap_B:
-; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: .LBB1_1: ; %loop
-; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v3, -1, v3
-; GFX12-FAKE16-NEXT: ;;#ASMSTART
-; GFX12-FAKE16-NEXT: ; use v0
-; GFX12-FAKE16-NEXT: ;;#ASMEND
-; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0
-; GFX12-FAKE16-NEXT: ;;#ASMSTART
-; GFX12-FAKE16-NEXT: ; use v4
-; GFX12-FAKE16-NEXT: ;;#ASMEND
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT: ;;#ASMSTART
-; GFX12-FAKE16-NEXT: ; use v2
-; GFX12-FAKE16-NEXT: ;;#ASMEND
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret
-; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
-entry:
- br label %loop
-
-loop:
- %t = phi half [%c, %entry], [%y, %loop]
- %x = phi half [%a, %entry], [%y, %loop]
- %y = phi half [%b, %entry], [%x, %loop]
-
- %i2 = phi i32 [%i, %entry], [%i3, %loop]
-
- call void asm sideeffect "; use $0", "v"(half %x)
- call void asm sideeffect "; use $0", "v"(half %y)
- call void asm sideeffect "; use $0", "v"(half %t)
-
- %i3 = sub i32 %i2, 1
-
- %cmp = icmp eq i32 %i3, 0
-
- br i1 %cmp, label %ret, label %loop
-
-ret:
- ret half %x
-}
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
index 43192d04a0e31..27229cd518028 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -27,8 +27,8 @@ body: |
# GCN-NEXT: {{^[ ]*$}}
# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
-# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
# GCN-NEXT: S_SETPC_B64_return
---
name: swap_phys_sparse
@@ -106,7 +106,8 @@ body: |
# GCN-NEXT: {{^[ ]*$}}
# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $mode, implicit $exec
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
---
name: swap_phys_overlap_x
tracksRegLiveness: true
@@ -220,8 +221,10 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
# GCN-NEXT: %3:vgpr_32 = COPY %0
-# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
# GCN-NEXT: S_ENDPGM 0
---
@@ -303,8 +306,10 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
-# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
# GCN-NEXT: S_ENDPGM 0
---
@@ -385,8 +390,10 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %3:vreg_64 = COPY %0
-# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
---
name: swap_virt_copy_subreg_overlap_x_full
tracksRegLiveness: true
@@ -409,8 +416,10 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %3:vreg_64 = COPY %0.sub0_sub1
-# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub0:vreg_128 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
---
name: swap_virt_copy_subreg_overlap_x_part
tracksRegLiveness: true
@@ -573,7 +582,6 @@ body: |
# GCN-LABEL: name: swap_virt_copy_subreg_impdef_super
# GCN: %2:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
-# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
---
name: swap_virt_copy_subreg_impdef_super
@@ -597,9 +605,11 @@ body: |
# GCN: bb.0:
# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0, implicit %0
# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
-# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
# GCN-NEXT: S_ENDPGM 0
---
name: swap_virt_copy_subreg_impuse_x
@@ -777,10 +787,11 @@ body: |
...
# GCN-LABEL: name: swap_killed_t_early
-# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit killed $vgpr2
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
---
name: swap_killed_t_early
@@ -797,10 +808,11 @@ body: |
...
# GCN-LABEL: name: swap_killed_t_late
-# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit killed $vgpr2
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
---
name: swap_killed_t_late
@@ -817,10 +829,11 @@ body: |
...
# GCN-LABEL: name: swap_killed_x
-# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit killed $vgpr0
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
---
name: swap_killed_x
@@ -905,12 +918,13 @@ body: |
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
...
-# GCN-LABEL: name: implicit_ops_mov_x_swap_b64
-# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
-# GCN-NEXT: %0.sub1:vreg_64, %1.sub1:vreg_64 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+# GCN-LABEL: name: implict_ops_mov_x_swap_b64
+# GCN: %2:vreg_64 = COPY %0
+# GCN-NEXT: %0:vreg_64 = COPY %1, implicit $vgpr0
+# GCN-NEXT: %1:vreg_64 = COPY %2
---
-name: implicit_ops_mov_x_swap_b64
+name: implict_ops_mov_x_swap_b64
tracksRegLiveness: true
body: |
bb.0:
@@ -953,51 +967,3 @@ body: |
$vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $vgpr2, implicit-def $vgpr0_vgpr1, implicit killed $vgpr3
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
...
-
-################################################################
-# In:
-#
-# MOV T, X
-# MOV X, Y
-# MOV Y, T
-#
-# ensure that intervening uses/defs of T, X, Y that allow swap
-# semantics to be preserved will not block swap generation.
-################################################################
-
-# GCN-LABEL: name: swap_allow_use_def_and_flexible_insertion_point
-# GCN: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
-# GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e32 $vgpr2, $vgpr0, implicit $exec
-# GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e32 $vgpr2, $vgpr1, implicit $exec
-# GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e32 1, $vgpr2, implicit $exec
-# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
-# GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e32 1, $vgpr0, implicit $exec
-# GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
-
----
-name: swap_allow_use_def_and_flexible_insertion_point
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
- ; MOV T, X
- $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
-
- ; Writes to Y and reads of T, X, Y do not block swap generation
- $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr0, implicit $exec
- $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr1, implicit $exec
-
- ; MOV X, Y
- $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
-
- ; Read of T before insertion point does not block swap generation
- $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr2, implicit $exec
- ; SWAP X, Y will be inserted here
-  ; Read of X after insertion point does not block swap generation
- $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr0, implicit $exec
-
- ; MOV Y, T
- $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec
-
- S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
-...
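The deleted swap_allow_use_def_and_flexible_insertion_point test above is the most direct picture of what this revert gives up. Under #184164, "mov x, y" could effectively sink and "mov y, t" could hoist to a common legal point; re-annotating the deleted test body in those terms (instructions from the test, comments ours):

    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec              ; mov t, x
    $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr0, implicit $exec  ; writes y, reads t, x
    $vgpr1 = V_LSHLREV_B32_e64 $vgpr2, $vgpr1, implicit $exec  ; writes y, reads t, y
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec              ; mov x, y (sinks)
    $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr2, implicit $exec       ; reads t
    ; the read of x below pins the insertion point, so the
    ; V_SWAP_B32 $vgpr1, $vgpr0 was emitted here
    $vgpr3 = V_LSHLREV_B32_e64 1, $vgpr0, implicit $exec       ; reads x
    $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec              ; mov y, t (hoists)

With the restored matcher, the writes to y and the read of x between the moves each abort the match, so the test no longer has an optimization to check and is removed rather than updated.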
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 2ebd37a7d3e1f..5b47988d72c47 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -5144,9 +5144,9 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GISEL-NEXT: scratch_store_b32 off, v247, s32 offset:572
; GISEL-NEXT: s_mov_b32 exec_lo, -1
; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_swap_b32 v0, v1
; GISEL-NEXT:    s_mov_b32 s36, gfx_callee@abs32@lo
; GISEL-NEXT:    s_mov_b32 s37, gfx_callee@abs32@hi
-; GISEL-NEXT: v_swap_b32 v0, v1
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload
@@ -5776,9 +5776,9 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GISEL64-NEXT: scratch_store_b32 off, v247, s32 offset:572
; GISEL64-NEXT: s_mov_b64 exec, -1
; GISEL64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL64-NEXT: v_swap_b32 v0, v1
; GISEL64-NEXT:    s_mov_b32 s36, gfx_callee@abs32@lo
; GISEL64-NEXT:    s_mov_b32 s37, gfx_callee@abs32@hi
-; GISEL64-NEXT: v_swap_b32 v0, v1
; GISEL64-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload