[llvm] [AMDGPU] Promote uniform ops to I32 in ISel (PR #106383)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 05:55:38 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
Author: Pierre van Houtryve (Pierre-vh)
Changes:
See #106382 for the NFC test updates.
Promote uniform binops, selects, and setcc to i32 in GlobalISel and DAGISel instead of in CodeGenPrepare.
Solves #64591.
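To make the change concrete, here is a minimal sketch (not taken from the patch's test updates; the function name is made up) of the kind of input this affects. Kernel arguments are uniform, so the i16 add below is a uniform 16-bit operation; with this patch it is widened to i32 during instruction selection rather than in AMDGPUCodeGenPrepare, so it can be selected as a 32-bit scalar-ALU instruction.

```llvm
; Hypothetical example (not from the patch's tests): %a and %b are kernel
; arguments and therefore uniform, so the i16 add is a uniform 16-bit op
; that this change promotes to i32 during ISel.
define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %out, i16 %a, i16 %b) {
  %add = add i16 %a, %b
  store i16 %add, ptr addrspace(1) %out
  ret void
}
```

Comparing the ISA output before and after the patch with something like `llc -mtriple=amdgcn -mcpu=gfx900` on a 16-bit-capable target should show the effect described above.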
---
Patch is 1.35 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106383.diff
88 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+1-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-9)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+6-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+4-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+27-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+33-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+1-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+113)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+149-7)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll (+33-37)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+60-54)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+100-63)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+72-48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+78-52)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll (+442-412)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+107-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+15-62)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll (+60-54)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll (+68-101)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (+6-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll (+49-39)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll (+25-29)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll (+4-22)
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+2-650)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+2-6)
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+2-5)
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll (+2-62)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+900-839)
- (modified) llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll (+4-7)
- (modified) llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+5-21)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+14-11)
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+3-11)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+14-24)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-select.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+1010-309)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+539-119)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+51-50)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+3-10)
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (-532)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+21-12)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+235-228)
- (modified) llvm/test/CodeGen/AMDGPU/imm16.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+57-49)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+910-993)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+74-86)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+3212-3431)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+2431-2404)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+5-10)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i8.ll (+5-10)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+225-170)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+27-24)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+44-29)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+108-119)
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll (+21-14)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+100-99)
- (modified) llvm/test/CodeGen/AMDGPU/select-i1.ll (+4-9)
- (modified) llvm/test/CodeGen/AMDGPU/select-vectors.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/setcc-opt.ll (+5-12)
- (modified) llvm/test/CodeGen/AMDGPU/sext-in-reg.ll (+4-10)
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/sign_extend.ll (+9-10)
- (modified) llvm/test/CodeGen/AMDGPU/smed3.ll (+17-3)
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+1013-83)
- (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+19-17)
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+16-18)
- (modified) llvm/test/CodeGen/AMDGPU/trunc-combine.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/trunc-store.ll (+80-56)
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+11-10)
- (modified) llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+35-26)
- (modified) llvm/test/CodeGen/AMDGPU/zero_extend.ll (+6-5)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..85310a4911b8ed 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
/// Return true if it's profitable to narrow operations of type SrcVT to
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
/// i32 to i16.
- virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+ virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..513ad392cb360a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
- TLI.isNarrowingProfitable(VT, SrcVT))
+ TLI.isNarrowingProfitable(N, VT, SrcVT))
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
// ShLeftAmt will indicate how much a narrowed load should be shifted left.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
- ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+ ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShLeftAmt = N01->getZExtValue();
N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
- if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
- if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
- TLI.isTruncateFree(SrcVT, VT)) {
+ if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+ TLI.isTruncateFree(SrcVT, VT)) {
+ if (!LegalOperations ||
+ (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+ TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
SDLoc SL(N0);
SDValue Cond = N0.getOperand(0);
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
- while (NewBW < BitWidth &&
- (NewVT.getStoreSizeInBits() != NewBW ||
- !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
- !TLI.isNarrowingProfitable(VT, NewVT))) {
+ while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !TLI.isNarrowingProfitable(N, VT, NewVT))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e796289cff0a1..97e10b3551db1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
- if (isNarrowingProfitable(VT, SmallVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
DemandedBits.countLeadingOnes() >= HalfWidth) {
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector()) {
APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SRL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
case ISD::SETULT:
case ISD::SETULE: {
EVT newVT = N0.getOperand(0).getValueType();
+ // FIXME: Should use isNarrowingProfitable.
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
- isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+ isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+ isTypeDesirableForOp(ISD::SETCC, newVT))) {
EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..f689fcf62fe8eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
cl::init(false));
static cl::opt<bool> Widen16BitOps(
- "amdgpu-codegenprepare-widen-16-bit-ops",
- cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
- cl::ReallyHidden,
- cl::init(true));
+ "amdgpu-codegenprepare-widen-16-bit-ops",
+ cl::desc(
+ "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(false));
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..01e96159babd03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+ (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ICMP, G_SELECT],
+ (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96143d688801aa..1a596cc80c0c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+ EVT DestVT) const {
+ switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::MUL:
+ case ISD::SETCC:
+ case ISD::SELECT:
+ if (Subtarget->has16BitInsts() &&
+ (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+ // Don't narrow back down to i16 if promoted to i32 already.
+ if (!N->isDivergent() && DestVT.isInteger() &&
+ DestVT.getScalarSizeInBits() > 1 &&
+ DestVT.getScalarSizeInBits() <= 16 &&
+ SrcVT.getScalarSizeInBits() > 16) {
+ return false;
+ }
+ }
+ return true;
+ default:
+ break;
+ }
+
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
// in a single 32-bit register should always be helpful. As currently used,
// this is much less general than the name suggests, and is only used in
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
// not profitable, and may actually be harmful.
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+ if (isa<LoadSDNode>(N))
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+ return true;
}
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..4dfa7ac052a5ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
NegatibleCost &Cost,
unsigned Depth) const override;
- bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+ bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..3b4faa35b93738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ bool matchPromote16to32(MachineInstr &MI) const;
+ void applyPromote16to32(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
return false;
}
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ const auto *RB = MRI.getRegBankOrNull(Dst);
+
+ // Only promote uniform instructions.
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ return STI.has16BitInsts() &&
+ (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_ASHR:
+ return AMDGPU::G_SEXT;
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_FSHR:
+ return AMDGPU::G_ZEXT;
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return AMDGPU::G_ANYEXT;
+ case AMDGPU::G_ICMP: {
+ return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+ : AMDGPU::G_ZEXT;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+ Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+ Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+ Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+ Opc == AMDGPU::G_ICMP);
+
+ Register Dst = MI.getOperand(0).getReg();
+
+ bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+ Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+ Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+ assert(MRI.getType(Dst) == LLT::scalar(16));
+ assert(MRI.getType(LHS) == LLT::scalar(16));
+ assert(MRI.getType(RHS) == LLT::scalar(16));
+
+ assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+ const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+ LLT S32 = LLT::scalar(32);
+
+ B.setInstrAndDebugLoc(MI);
+ const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+ LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+ RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+ MRI.setRegBank(LHS, RB);
+ MRI.setRegBank(RHS, RB);
+
+ MachineInstr *NewInst;
+ if (IsSelectOrCmp)
+ NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+ else
+ NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+ if (Opc != AMDGPU::G_ICMP) {
+ Register Dst32 = NewInst->getOperand(0).getReg();
+ MRI.setRegBank(Dst32, RB);
+ B.buildTrunc(Dst, Dst32);
+ }
+
+ switch (Opc) {
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SHL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_SUB:
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_MUL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ break;
+ }
+
+ MI.eraseFromParent();
+}
+
void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
Register &Reg) const {
B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..96a59acd751a62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
+ ISD::MUL,
ISD::FADD,
ISD::FSUB,
ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UMIN,
ISD::UMAX,
ISD::SETCC,
+ ISD::SELECT,
+ ISD::SMIN,
+ ISD::SMAX,
+ ISD::UMIN,
+ ISD::UMAX,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::SHL,
+ ISD::SRL,
+ ISD::SRA,
ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
switch (Op) {
case ISD::LOAD:
case ISD::STORE:
-
- // These operations are done with 32-bit instructions anyway.
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case ISD::SELECT:
- // TODO: Extensions?
return true;
default:
return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}
+static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
+ switch (Op->getOpcode()) {
+ case ISD::SRA:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ return ISD::SIGN_EXTEND;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SRL:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ return ISD::ZERO_EXTEND;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SELECT:
+ case ISD::MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return ISD::ANY_EXTEND;
+ case ISD::SETCC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
+ Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
+ Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
+ Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
+ Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ if (!Subtarget->has16BitInsts() ||
+ (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
+ return SDValue();
+
+ // Promote uniform scalar and vector integers between 2 and 16 bits.
+ if (Op->isDivergent() || !OpTy.isInteger() ||
+ OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
+ return SDValue();
+
+ auto &DAG = DCI.DAG;
+
+ SDLoc DL(Op);
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0)...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/106383
More information about the llvm-commits mailing list