[llvm] [AMDGPU] Promote uniform ops to I32 in ISel (PR #106383)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 05:55:38 PDT 2024


llvmbot wrote:


@llvm/pr-subscribers-llvm-selectiondag

Author: Pierre van Houtryve (Pierre-vh)

Changes:

See #106382 for NFC test updates.

Promote uniform 16-bit binops, selects, and setccs to 32 bits in both GlobalISel and SelectionDAG ISel instead of in AMDGPUCodeGenPrepare.

Solves #64591.
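
At the DAG level, the promotion rewrites a uniform 16-bit op as `trunc(op32(ext(a), ext(b)))`, choosing sign-, zero-, or any-extension per opcode (see `getExtOpcodeForPromotedOp` in the diff). Below is a minimal sketch of that shape for a single opcode; the helper name is hypothetical, while the patch's actual `promoteUniformOpToI32` generalizes this over many opcodes:

```cpp
// Hypothetical helper illustrating the promotion shape for a uniform i16 ADD.
// The patch's promoteUniformOpToI32 generalizes this over many opcodes and
// also handles the operand positions of SETCC/SELECT.
static SDValue promoteUniformI16AddToI32(SelectionDAG &DAG, SDValue Op) {
  assert(Op.getOpcode() == ISD::ADD && Op.getValueType() == MVT::i16 &&
         !Op->isDivergent() && "expected a uniform i16 add");
  SDLoc DL(Op);
  // ADD is promoted with zero-extended operands; the 32-bit result is
  // truncated back to i16 so surrounding uses are unaffected.
  SDValue LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op.getOperand(0));
  SDValue RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op.getOperand(1));
  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i32, LHS, RHS);
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Add);
}
```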

---

Patch is 1.35 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106383.diff


88 Files Affected:

- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+1-1) 
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-9) 
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+6-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+4-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+27-1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+33-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+1-1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+113) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+149-7) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1-1) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-1) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll (+33-37) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+60-54) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+100-63) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+72-48) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+78-52) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll (+442-412) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+107-42) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+15-62) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll (+60-54) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll (+68-101) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (+6-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll (+49-39) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll (+25-29) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll (+4-22) 
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+11-11) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+3-4) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+2-650) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+2-6) 
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+2-5) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll (+2-62) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+900-839) 
- (modified) llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll (+4-7) 
- (modified) llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+5-21) 
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+14-11) 
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+3-11) 
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+14-24) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-select.ll (+2-3) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+1010-309) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+539-119) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+51-50) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+3-10) 
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (-532) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+21-12) 
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+235-228) 
- (modified) llvm/test/CodeGen/AMDGPU/imm16.ll (+9-9) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+57-49) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+910-993) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+74-86) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+20-20) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+3212-3431) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+2431-2404) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+5-10) 
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i8.ll (+5-10) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+225-170) 
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+27-24) 
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+44-29) 
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+108-119) 
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll (+21-14) 
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+100-99) 
- (modified) llvm/test/CodeGen/AMDGPU/select-i1.ll (+4-9) 
- (modified) llvm/test/CodeGen/AMDGPU/select-vectors.ll (+2-3) 
- (modified) llvm/test/CodeGen/AMDGPU/setcc-opt.ll (+5-12) 
- (modified) llvm/test/CodeGen/AMDGPU/sext-in-reg.ll (+4-10) 
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/sign_extend.ll (+9-10) 
- (modified) llvm/test/CodeGen/AMDGPU/smed3.ll (+17-3) 
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+1013-83) 
- (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+40-40) 
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+19-17) 
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+16-18) 
- (modified) llvm/test/CodeGen/AMDGPU/trunc-combine.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/trunc-store.ll (+80-56) 
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+9-6) 
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+9-6) 
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+11-10) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+35-26) 
- (modified) llvm/test/CodeGen/AMDGPU/zero_extend.ll (+6-5) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..85310a4911b8ed 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
   /// Return true if it's profitable to narrow operations of type SrcVT to
   /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
   /// i32 to i16.
-  virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..513ad392cb360a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT))
+        TLI.isNarrowingProfitable(N, VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
   // ShLeftAmt will indicate how much a narrowed load should be shifted left.
   unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+      ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       ShLeftAmt = N01->getZExtValue();
       N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   }
 
   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
-  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
-    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
-        TLI.isTruncateFree(SrcVT, VT)) {
+  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+      TLI.isTruncateFree(SrcVT, VT)) {
+    if (!LegalOperations ||
+        (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+         TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
       SDLoc SL(N0);
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth &&
-           (NewVT.getStoreSizeInBits() != NewBW ||
-            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-            !TLI.isNarrowingProfitable(VT, NewVT))) {
+    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e796289cff0a1..97e10b3551db1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
         for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
              SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
           EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
-          if (isNarrowingProfitable(VT, SmallVT) &&
+          if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
               isTypeDesirableForOp(ISD::SHL, SmallVT) &&
               isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
               (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
           DemandedBits.countLeadingOnes() >= HalfWidth) {
         EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
-        if (isNarrowingProfitable(VT, HalfVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
             isTypeDesirableForOp(ISD::SHL, HalfVT) &&
             isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if ((BitWidth % 2) == 0 && !VT.isVector()) {
         APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
         EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
-        if (isNarrowingProfitable(VT, HalfVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
             isTypeDesirableForOp(ISD::SRL, HalfVT) &&
             isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       case ISD::SETULT:
       case ISD::SETULE: {
         EVT newVT = N0.getOperand(0).getValueType();
+        // FIXME: Should use isNarrowingProfitable.
         if (DCI.isBeforeLegalizeOps() ||
             (isOperationLegal(ISD::SETCC, newVT) &&
-             isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+             isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+             isTypeDesirableForOp(ISD::SETCC, newVT))) {
           EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
           SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..f689fcf62fe8eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
   cl::init(false));
 
 static cl::opt<bool> Widen16BitOps(
-  "amdgpu-codegenprepare-widen-16-bit-ops",
-  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-  cl::ReallyHidden,
-  cl::init(true));
+    "amdgpu-codegenprepare-widen-16-bit-ops",
+    cl::desc(
+        "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+    cl::ReallyHidden, cl::init(false));
 
 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..01e96159babd03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<
 
 } // End Predicates = [NotHasMed3_16]
 
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+          (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+    [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ICMP, G_SELECT],
+          (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+    [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
 // Combines which should only apply on SI/CI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96143d688801aa..1a596cc80c0c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
   return Src == MVT::i32 && Dest == MVT::i64;
 }
 
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+                                                 EVT DestVT) const {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::MUL:
+  case ISD::SETCC:
+  case ISD::SELECT:
+    if (Subtarget->has16BitInsts() &&
+        (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+      // Don't narrow back down to i16 if promoted to i32 already.
+      if (!N->isDivergent() && DestVT.isInteger() &&
+          DestVT.getScalarSizeInBits() > 1 &&
+          DestVT.getScalarSizeInBits() <= 16 &&
+          SrcVT.getScalarSizeInBits() > 16) {
+        return false;
+      }
+    }
+    return true;
+  default:
+    break;
+  }
+
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
   // in a single 32-bit register should always be helpful. As currently used,
   // this is much less general than the name suggests, and is only used in
   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
   // not profitable, and may actually be harmful.
-  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+  if (isa<LoadSDNode>(N))
+    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+  return true;
 }
 
 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..4dfa7ac052a5ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
                                NegatibleCost &Cost,
                                unsigned Depth) const override;
 
-  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
 
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..3b4faa35b93738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
   void applyClamp(MachineInstr &MI, Register &Reg) const;
 
+  bool matchPromote16to32(MachineInstr &MI) const;
+  void applyPromote16to32(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
   return false;
 }
 
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+  Register Dst = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  const auto *RB = MRI.getRegBankOrNull(Dst);
+
+  // Only promote uniform instructions.
+  if (!RB || RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  return STI.has16BitInsts() &&
+         (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_ASHR:
+    return AMDGPU::G_SEXT;
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SUB:
+  case AMDGPU::G_FSHR:
+    return AMDGPU::G_ZEXT;
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR:
+  case AMDGPU::G_SHL:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return AMDGPU::G_ANYEXT;
+  case AMDGPU::G_ICMP: {
+    return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+                                                        : AMDGPU::G_ZEXT;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+         Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+         Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+         Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+         Opc == AMDGPU::G_ICMP);
+
+  Register Dst = MI.getOperand(0).getReg();
+
+  bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+  Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+  Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+  assert(MRI.getType(Dst) == LLT::scalar(16));
+  assert(MRI.getType(LHS) == LLT::scalar(16));
+  assert(MRI.getType(RHS) == LLT::scalar(16));
+
+  assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+  const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInstrAndDebugLoc(MI);
+  const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+  LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+  RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+  MRI.setRegBank(LHS, RB);
+  MRI.setRegBank(RHS, RB);
+
+  MachineInstr *NewInst;
+  if (IsSelectOrCmp)
+    NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+  else
+    NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+  if (Opc != AMDGPU::G_ICMP) {
+    Register Dst32 = NewInst->getOperand(0).getReg();
+    MRI.setRegBank(Dst32, RB);
+    B.buildTrunc(Dst, Dst32);
+  }
+
+  switch (Opc) {
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SHL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_SUB:
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_MUL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    if (MI.getFlag(MachineInstr::NoSWrap))
+      NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  }
+
+  MI.eraseFromParent();
+}
+
 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
                                            Register &Reg) const {
   B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..96a59acd751a62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
                        ISD::USUBO_CARRY,
+                       ISD::MUL,
                        ISD::FADD,
                        ISD::FSUB,
                        ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::UMIN,
                        ISD::UMAX,
                        ISD::SETCC,
+                       ISD::SELECT,
+                       ISD::SMIN,
+                       ISD::SMAX,
+                       ISD::UMIN,
+                       ISD::UMAX,
                        ISD::AND,
                        ISD::OR,
                        ISD::XOR,
+                       ISD::SHL,
+                       ISD::SRL,
+                       ISD::SRA,
                        ISD::FSHR,
                        ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
     switch (Op) {
     case ISD::LOAD:
     case ISD::STORE:
-
-    // These operations are done with 32-bit instructions anyway.
-    case ISD::AND:
-    case ISD::OR:
-    case ISD::XOR:
-    case ISD::SELECT:
-      // TODO: Extensions?
       return true;
     default:
       return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
 }
 
+static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
+  switch (Op->getOpcode()) {
+  case ISD::SRA:
+  case ISD::SMIN:
+  case ISD::SMAX:
+    return ISD::SIGN_EXTEND;
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SRL:
+  case ISD::UMIN:
+  case ISD::UMAX:
+    return ISD::ZERO_EXTEND;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SELECT:
+  case ISD::MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return ISD::ANY_EXTEND;
+  case ISD::SETCC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
+                                                DAGCombinerInfo &DCI) const {
+  const unsigned Opc = Op.getOpcode();
+  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
+         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
+         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
+         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
+         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
+
+  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+                                 : Op->getOperand(0).getValueType();
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  if (!Subtarget->has16BitInsts() ||
+      (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
+    return SDValue();
+
+  // Promote uniform scalar and vector integers between 2 and 16 bits.
+  if (Op->isDivergent() || !OpTy.isInteger() ||
+      OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
+    return SDValue();
+
+  auto &DAG = DCI.DAG;
+
+  SDLoc DL(Op);
+  SDValue LHS;
+  SDValue RHS;
+  if (Opc == ISD::SELECT) {
+    LHS = Op->getOperand(1);
+    RHS = Op->getOperand(2);
+  } else {
+    LHS = Op->getOperand(0)...
[truncated]

``````````
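
Note that the `isNarrowingProfitable` change in the first hunk is an interface change: every override now receives the node being narrowed (the X86 files above are updated accordingly). A hedged sketch of what an adapted out-of-tree override could look like, with `MyTargetLowering` purely hypothetical:

```cpp
// Hypothetical override showing the updated hook signature. The extra SDNode*
// lets a target base the decision on the operation itself (opcode,
// divergence), as the AMDGPU override in this patch does; a target that only
// cares about types can simply ignore N.
bool MyTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                             EVT DestVT) const {
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
```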



https://github.com/llvm/llvm-project/pull/106383


More information about the llvm-commits mailing list