[llvm] [AMDGPU] Implement moveToVALU for S_CSELECT_B64 (PR #70352)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 26 09:50:31 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
moveToVALU previously only handled S_CSELECT_B64 in the trivial case
where it was semantically equivalent to a copy. Implement the general
case using V_CNDMASK_B64_PSEUDO and implement post-RA expansion of
V_CNDMASK_B64_PSEUDO with immediate as well as register operands.
---
Full diff: https://github.com/llvm/llvm-project/pull/70352.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+34-12)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+33-26)
- (added) llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll (+34)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir (+19)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5cdf..20bae0f27647a42 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4727,8 +4727,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
const DebugLoc &DL = MI.getDebugLoc();
Register SrcCond = MI.getOperand(3).getReg();
@@ -4737,20 +4737,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub0)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub0)
+ .addImm(0)
+ .add(Src1Sub0)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub1)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub1)
+ .addImm(0)
+ .add(Src1Sub1)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 327f8988ac2f105..f1d1fd386d1b2cc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7223,27 +7223,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineOperand &Src1 = Inst.getOperand(2);
MachineOperand &Cond = Inst.getOperand(3);
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
+ Register CondReg = Cond.getReg();
+ bool IsSCC = (CondReg == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
+ // (CondReg is a source of copy to SCC), then the select is semantically
+ // equivalent to copying CondReg. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ MRI.replaceRegWith(Dest.getReg(), CondReg);
return;
}
- const TargetRegisterClass *TC =
- RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register CopySCC = MRI.createVirtualRegister(TC);
-
+ Register NewCondReg = CondReg;
if (IsSCC) {
+ const TargetRegisterClass *TC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ NewCondReg = MRI.createVirtualRegister(TC);
+
// Now look for the closest SCC def if it is a copy
- // replacing the SCCSource with the COPY source register
+ // replacing the CondReg with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
@@ -7251,7 +7251,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
- BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
@@ -7266,24 +7266,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
- BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}
- Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- auto UpdatedInst =
- BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
- .addImm(0)
- .add(Src1) // False
- .addImm(0)
- .add(Src0) // True
- .addReg(IsSCC ? CopySCC : SCCSource);
-
- MRI.replaceRegWith(Dest.getReg(), ResultReg);
- legalizeOperands(*UpdatedInst, MDT);
- addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ Register NewDestReg = MRI.createVirtualRegister(
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+ MachineInstr *NewInst;
+ if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ } else {
+ NewInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+ .add(Src1) // False
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ }
+ MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+ legalizeOperands(*NewInst, MDT);
+ addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}
void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
new file mode 100644
index 000000000000000..ac196635b363a41
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s
+
+define amdgpu_cs <2 x i32> @f() {
+; CHECK-LABEL: f:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: s_mov_b32 s2, s0
+; CHECK-NEXT: s_mov_b32 s3, s0
+; CHECK-NEXT: s_mov_b32 s4, s0
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT: s_mov_b32 s5, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; CHECK-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT: ; return to shader part epilog
+bb:
+ %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ %i1 = bitcast <2 x i32> %i to i64
+ %i2 = insertelement <3 x i64> zeroinitializer, i64 %i1, i64 2
+ %i3 = icmp ne <3 x i64> %i2, zeroinitializer
+ %i4 = zext <3 x i1> %i3 to <3 x i64>
+ %i5 = bitcast <3 x i64> %i4 to <6 x i32>
+ %i6 = shufflevector <6 x i32> %i5, <6 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+ call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %i6, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ ret <2 x i32> %i6
+}
+
+declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 8d0a9899b5dbcc7..b1715bd9d332864 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -233,3 +233,22 @@ body: |
%0:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
%1:sreg_32 = COPY %0.sub0
...
+
+---
+# GCN-LABEL: name: s_cselect_b64
+# GCN: %0:vgpr_32 = IMPLICIT_DEF
+# GCN: %1:vreg_64 = IMPLICIT_DEF
+# GCN: %2:sreg_32 = IMPLICIT_DEF
+# GCN: %3:sreg_64 = IMPLICIT_DEF
+# GCN: %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
+# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec
+name: s_cselect_b64
+body: |
+ bb.0:
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ %3:sreg_64 = COPY %1
+ S_CMP_EQ_U32 %2, 0, implicit-def $scc
+ %4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/70352
More information about the llvm-commits
mailing list