[llvm] [LLVM] Make use of s_flbit_i32_b64 and s_ff1_i32_b64 (PR #75158)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 12 02:14:23 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Acim Maravic (Acim-Maravic)
Changes:
Update DAG ISel to select the 64-bit scalar instructions S_FF1_I32_B64 and S_FLBIT_I32_B64 for uniform 64-bit cttz/ctlz.
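To make the motivation concrete, here is a hypothetical HIP-style kernel (illustration only, not part of the patch) whose uniform 64-bit cttz previously expanded to two s_ff1_i32_b32 plus s_add_i32 and s_min_u32, and can now select a single s_ff1_i32_b64; the test updates below show exactly this before/after pattern:

```cpp
// Hypothetical HIP kernel (illustration only): val is a kernel argument,
// hence uniform, so its cttz can be selected on the scalar ALU.
__global__ void uniform_cttz(unsigned long long val, unsigned long long *out) {
  // __builtin_ctzll lowers to llvm.cttz.i64 (zero input undefined); for a
  // uniform i64 operand this patch selects s_ff1_i32_b64 instead of the
  // split 32-bit sequence.
  *out = __builtin_ctzll(val);
}
```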
---
Patch is 176.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75158.diff
21 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+11)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+65)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+21-60)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+26-52)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+75-216)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+14-40)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+14-40)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+30-46)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+787-16)
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+21-31)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+24-27)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+30-75)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+12-39)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+12-39)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+30-75)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll (+4-16)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+5-20)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+35-56)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+28-46)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+32-44)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1fc..7c0947945c9ad0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3067,6 +3067,17 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
return NewOpr;
}
+  // 64-bit scalar ctlz/cttz should use S_FLBIT_I32_B64/S_FF1_I32_B64.
+  // The 64-bit scalar versions produce a 32-bit result.
+  if (!Src->isDivergent() && Src.getValueType() == MVT::i64) {
+    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
+    if (!ZeroUndef) {
+      const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
+      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
+    }
+    return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
+  }
+
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(Src, DAG);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0a06fa88b6b102..88a95ba9b00107 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6849,6 +6849,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_FLBIT_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::S_FLBIT_I32_B32);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_FF1_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::S_FF1_I32_B32);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_LSHL_B32:
if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7834,6 +7843,62 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+// Expand a 64-bit scalar count op (S_FLBIT_I32_B64 / S_FF1_I32_B64) over the
+// two 32-bit halves: count each half, bias one result by 32, then take the
+// three-way unsigned minimum with 64 via V_MIN3_U32.
+void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
+                                          MachineInstr &Instr, unsigned Opcode,
+                                          MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Instr.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Instr;
+ const DebugLoc &DL = Instr.getDebugLoc();
+
+ MachineOperand &Dest = Instr.getOperand(0);
+ MachineOperand &Src = Instr.getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+  bool IsCtlz = Opcode == AMDGPU::S_FLBIT_I32_B32;
+
+  const TargetRegisterClass *SrcRC =
+      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *SrcSubRC =
+ RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub1, SrcSubRC);
+
+ Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineInstr *M1 = BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
+ MachineInstr *M2 = BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
+ MachineInstr *M3 = nullptr;
+ MachineInstr *M4 = nullptr;
+
+  if (IsCtlz) {
+    M3 = BuildMI(MBB, MII, DL, get(AMDGPU::S_ADD_I32), MidReg3)
+             .addReg(MidReg1)
+             .addImm(32);
+    M4 = BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN3_U32_e64), MidReg4)
+             .addReg(MidReg3)
+             .addReg(MidReg2)
+             .addImm(64);
+  } else {
+    M3 = BuildMI(MBB, MII, DL, get(AMDGPU::S_ADD_I32), MidReg3)
+             .addReg(MidReg2)
+             .addImm(32);
+    M4 = BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN3_U32_e64), MidReg4)
+             .addReg(MidReg3)
+             .addReg(MidReg1)
+             .addImm(64);
+  }
+
+ MRI.replaceRegWith(Dest.getReg(), MidReg4);
+
+ Worklist.insert(M1);
+ Worklist.insert(M2);
+ Worklist.insert(M3);
+ Worklist.insert(M4);
+
+ addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
Register DstReg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0ce31ac6d54ec5..effd891eb26edc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -147,6 +147,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+  void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Instr,
+                               unsigned Opcode,
+                               MachineDominatorTree *MDT = nullptr) const;
void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5a..12574c41f7cd31 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -458,13 +458,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -502,13 +499,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -545,10 +539,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -627,16 +618,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -739,13 +726,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -785,13 +769,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -830,10 +811,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -918,16 +896,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1540,13 +1514,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1584,13 +1555,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1627,10 +1595,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1709,16 +1674,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f8..d64112e6972de3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -525,15 +525,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
@@ -574,15 +571,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
@@ -623,10 +617,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s2
-; GFX1064-NEXT: s_add_i32 s4, s4, 32
-; GFX1064-NEXT: s_min_u32 s7, s5, s4
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
@@ -721,16 +712,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1164-NEXT: s_add_i32 s4, s4, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s7, s5, s4
+; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s6, s6, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
@@ -2038,15 +2025,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2087,15 +2071,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2136,10 +2117,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, ...
[truncated]
``````````
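For readers skimming the SIInstrInfo change: splitScalar64BitCountOp handles the case where a 64-bit scalar count op must later move to the VALU, rewriting it over the two 32-bit halves. Below is a minimal self-checking sketch of the identity the emitted S_ADD_I32/V_MIN3_U32 sequence implements, under my reading of the patch; the model of S_FLBIT_I32_B32's zero-input behavior is an assumption, and none of this code is from the patch itself.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Models S_FLBIT_I32_B32: count of leading zeros, assumed to return
// 0xffffffff when the input is zero.
static uint32_t flbit32(uint32_t x) {
  if (x == 0)
    return 0xffffffffu;
  uint32_t n = 0;
  while (!(x & 0x80000000u)) {
    x <<= 1;
    ++n;
  }
  return n;
}

// ctlz64(x) = min3(flbit32(lo) + 32, flbit32(hi), 64). The cttz expansion is
// symmetric: min3(ff1(hi) + 32, ff1(lo), 64). Note flbit32(0) + 32 wraps to
// 31 in 32-bit arithmetic, so the all-zero input is not modeled here.
static uint32_t ctlz64_split(uint64_t x) {
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  return std::min({flbit32(lo) + 32, flbit32(hi), 64u});
}

int main() {
  assert(ctlz64_split(1) == 63);                    // only bit 0 set
  assert(ctlz64_split(0x8000000000000000ull) == 0); // top bit set
  assert(ctlz64_split(0x00000000ffffffffull) == 32); // high half zero
  return 0;
}
```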
https://github.com/llvm/llvm-project/pull/75158
More information about the llvm-commits mailing list