[llvm] 2cfda6a - [AMDGPU] Fold immediates in the optimizeCompareInstr
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 2 17:23:45 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-09-02T17:23:26-07:00
New Revision: 2cfda6a6912e140bc02f569aa7992deed7b54ea9
URL: https://github.com/llvm/llvm-project/commit/2cfda6a6912e140bc02f569aa7992deed7b54ea9
DIFF: https://github.com/llvm/llvm-project/commit/2cfda6a6912e140bc02f569aa7992deed7b54ea9.diff
LOG: [AMDGPU] Fold immediates in the optimizeCompareInstr
The peephole optimizer runs before the first SIFoldOperands pass, so
at that point most of the immediates are still held in registers.
Differential Revision: https://reviews.llvm.org/D109186
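For illustration (the example is taken from the new optimize-compare.mir
tests below): with both the AND mask and the compare operand held in
registers, the peephole can now look through the defining S_MOV_B32 and
turn the sequence
    %0:sreg_32 = COPY $sgpr0
    %1:sreg_32 = S_MOV_B32 1
    %2:sreg_32 = S_AND_B32 %1, killed %0, implicit-def dead $scc
    S_CMP_EQ_U32 killed %2, %1, implicit-def $scc
into a single bit test:
    S_BITCMP1_B32 killed %0, 0, implicit-def $scc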
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/basic-branch.ll
llvm/test/CodeGen/AMDGPU/optimize-compare.mir
llvm/test/CodeGen/AMDGPU/setcc.ll
llvm/test/CodeGen/AMDGPU/wave32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 64ba869ed2b9..5b3777a0bd7f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2701,7 +2701,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
}
}
-bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
@@ -3079,16 +3079,24 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
return false;
}
-static int64_t getFoldableImm(const MachineOperand* MO) {
+static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
+ int64_t &Imm) {
+ if (Reg.isPhysical())
+ return false;
+ auto *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
+ Imm = Def->getOperand(1).getImm();
+ return true;
+ }
+ return false;
+}
+
+static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) {
if (!MO->isReg())
return false;
const MachineFunction *MF = MO->getParent()->getParent()->getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
- auto Def = MRI.getUniqueVRegDef(MO->getReg());
- if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- Def->getOperand(1).isImm())
- return Def->getOperand(1).getImm();
- return AMDGPU::NoRegister;
+ return getFoldableImm(MO->getReg(), MRI, Imm);
}
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
@@ -3160,7 +3168,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
- if (auto Imm = getFoldableImm(Src2)) {
+ int64_t Imm;
+ if (getFoldableImm(Src2, Imm)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
@@ -3177,7 +3186,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
unsigned NewOpc = IsFMA
? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
- if (auto Imm = getFoldableImm(Src1)) {
+ if (getFoldableImm(Src1, Imm)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
@@ -3188,7 +3197,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
return MIB;
}
}
- if (auto Imm = getFoldableImm(Src0)) {
+ if (getFoldableImm(Src0, Imm)) {
if (pseudoToMCOpcode(NewOpc) != -1 &&
isOperandLegal(
MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
@@ -8004,7 +8013,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Register SrcReg2, int64_t CmpMask,
int64_t CmpValue,
const MachineRegisterInfo *MRI) const {
- if (SrcReg2 || SrcReg.isPhysical())
+ if (!SrcReg || SrcReg.isPhysical())
+ return false;
+
+ if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
@@ -8049,10 +8061,21 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Def->getOpcode() != AMDGPU::S_AND_B64)
return false;
+ const auto isMask = [](const MachineOperand *MO) -> bool {
+ int64_t Mask;
+ if (MO->isImm())
+ Mask = MO->getImm();
+ else if (!getFoldableImm(MO, Mask))
+ return false;
+ return Mask == 1;
+ };
+
MachineOperand *SrcOp = &Def->getOperand(1);
- if (SrcOp->isImm() && SrcOp->getImm() == 1)
+ if (isMask(SrcOp))
SrcOp = &Def->getOperand(2);
- else if (!Def->getOperand(2).isImm() || Def->getOperand(2).getImm() != 1)
+ else if (isMask(&Def->getOperand(2)))
+ SrcOp = &Def->getOperand(1);
+ else
return false;
Register DefReg = Def->getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2d5716c9bee6..4c405b15e049 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -331,7 +331,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
const MachineInstr &MIb) const override;
- bool isFoldableCopy(const MachineInstr &MI) const;
+ static bool isFoldableCopy(const MachineInstr &MI);
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const final;
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index efcc2ae04e1f..b5c1510ee198 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -32,8 +32,8 @@ end:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
-; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
-; GCN: s_cmp_eq_u32
+; GCNOPT: s_bitcmp0_b32 [[VAL]], 0
+; GCNNOOPT: s_cmp_eq_u32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
; GCN: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index b38a2252e379..b77ab553d664 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -1340,3 +1340,105 @@ body: |
S_ENDPGM 0
...
+
+---
+name: and_1_folded_src0_cmp_eq_u32_1_folded_src2
+body: |
+ ; GCN-LABEL: name: and_1_folded_src0_cmp_eq_u32_1_folded_src2
+ ; GCN: bb.0:
+ ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GCN: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN: S_BRANCH %bb.1
+ ; GCN: bb.1:
+ ; GCN: successors: %bb.2(0x80000000)
+ ; GCN: bb.2:
+ ; GCN: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_MOV_B32 1
+ %2:sreg_32 = S_AND_B32 %1, killed %0, implicit-def dead $scc
+ S_CMP_EQ_U32 killed %2:sreg_32, %1, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+name: and_1_folded_src1_cmp_eq_u32_1_folded_src2
+body: |
+ ; GCN-LABEL: name: and_1_folded_src1_cmp_eq_u32_1_folded_src2
+ ; GCN: bb.0:
+ ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GCN: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN: S_BRANCH %bb.1
+ ; GCN: bb.1:
+ ; GCN: successors: %bb.2(0x80000000)
+ ; GCN: bb.2:
+ ; GCN: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_MOV_B32 1
+ %2:sreg_32 = S_AND_B32 killed %0, %1, implicit-def dead $scc
+ S_CMP_EQ_U32 killed %2:sreg_32, %1, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+name: and_1_folded_src1_cmp_eq_u64_1_folded_src2
+body: |
+ ; GCN-LABEL: name: and_1_folded_src1_cmp_eq_u64_1_folded_src2
+ ; GCN: bb.0:
+ ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GCN: S_BITCMP1_B64 killed [[COPY]], 0, implicit-def $scc
+ ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN: S_BRANCH %bb.1
+ ; GCN: bb.1:
+ ; GCN: successors: %bb.2(0x80000000)
+ ; GCN: bb.2:
+ ; GCN: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+
+ %0:sreg_64 = COPY $sgpr0_sgpr1
+ %1:sreg_64 = S_MOV_B64 1
+ %2:sreg_64 = S_AND_B64 killed %0, %1, implicit-def dead $scc
+ S_CMP_EQ_U64 killed %2:sreg_64, %1, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index 7d41f09decd7..231e8523eafe 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -380,8 +380,7 @@ define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; Make sure we don't try to emit i1 setcc ops
; FUNC-LABEL: setcc-i1
-; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
-; GCN: s_cmp_eq_u32 [[AND]], 0
+; GCN: s_bitcmp0_b32 s{{[0-9]+}}, 0
define amdgpu_kernel void @setcc-i1(i32 %in) #0 {
%and = and i32 %in, 1
%cmp = icmp eq i32 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index a9fc03a6874d..c0905736b8a9 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -521,7 +521,7 @@ two:
}
; GCN-LABEL: {{^}}test_brcc_i1:
-; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
+; GCN: s_bitcmp0_b32 s{{[0-9]+}}, 0
; GCN-NEXT: s_cbranch_scc1
define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
%cmp0 = icmp ne i1 %val, 0