[llvm] c01f284 - AMDGPU: Fix regressions in integer mad matching

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 8 13:48:53 PDT 2023


Author: Matt Arsenault
Date: 2023-06-08T16:48:47-04:00
New Revision: c01f284fbbf4c93c65b1f92fb8b3d263afb796c0

URL: https://github.com/llvm/llvm-project/commit/c01f284fbbf4c93c65b1f92fb8b3d263afb796c0
DIFF: https://github.com/llvm/llvm-project/commit/c01f284fbbf4c93c65b1f92fb8b3d263afb796c0.diff

LOG: AMDGPU: Fix regressions in integer mad matching

Undo the canonicalization done in
0cfc6510323fbb5a56a5de23cbc65f7cc30fd34c. Restores some regressed
matching of integer mad. The selection patterns for the actual mads
don't seem to be properly commuting, so some of the commuted cases are
still missed.

Fixes: SWDEV-363009

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c0f75d17dd231..38a4bd9260913 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3516,6 +3516,16 @@ static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
 }
 
+/// If \p V is an add of a constant 1, returns the other operand. Otherwise
+/// return SDValue().
+static SDValue getAddOneOp(const SDNode *V) {
+  if (V->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1));
+  return C && C->isOne() ? V->getOperand(0) : SDValue();
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -3531,16 +3541,49 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   if (VT.isVector() || Size > 64)
     return SDValue();
 
-  // There are i16 integer mul/mad.
-  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
-    return SDValue();
-
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
+  // matching.
+
+  // mul x, (add y, 1) -> add (mul x, y), x
+  auto IsFoldableAdd = [](SDValue V) -> SDValue {
+    SDValue AddOp = getAddOneOp(V.getNode());
+    if (!AddOp)
+      return SDValue();
+
+    if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
+          return U->getOpcode() == ISD::MUL;
+        }))
+      return AddOp;
+
+    return SDValue();
+  };
+
+  // FIXME: The selection pattern is not properly checking for commuted
+  // operands, so we have to place the mul in the LHS
+  if (SDValue MulOper = IsFoldableAdd(N0)) {
+    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
+    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
+  }
+
+  if (SDValue MulOper = IsFoldableAdd(N1)) {
+    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
+    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
+  }
+
+  // Skip if already mul24.
+  if (N->getOpcode() != ISD::MUL)
+    return SDValue();
+
+  // There are i16 integer mul/mad.
+  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+    return SDValue();
+
   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
   // in the source into any_extends if the result of the mul is truncated. Since
   // we can assume the high bits are whatever we want, use the underlying value
@@ -4348,6 +4391,15 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
+  case AMDGPUISD::MUL_U24:
+  case AMDGPUISD::MUL_I24: {
+    if (SDValue Simplified = simplifyMul24(N, DCI))
+      return Simplified;
+    return performMulCombine(N, DCI);
+  }
+  case AMDGPUISD::MULHI_I24:
+  case AMDGPUISD::MULHI_U24:
+    return simplifyMul24(N, DCI);
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
     return performMulLoHiCombine(N, DCI);
@@ -4355,11 +4407,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performMulhsCombine(N, DCI);
   case ISD::MULHU:
     return performMulhuCombine(N, DCI);
-  case AMDGPUISD::MUL_I24:
-  case AMDGPUISD::MUL_U24:
-  case AMDGPUISD::MULHI_I24:
-  case AMDGPUISD::MULHI_U24:
-    return simplifyMul24(N, DCI);
   case ISD::SELECT:
     return performSelectCombine(N, DCI);
   case ISD::FNEG:

diff  --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 877f21eb23a8e..434988cf2678f 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -14,30 +14,28 @@ define i32 @v_mul_add_1_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_1_i32:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul = mul i32 %x, %add
@@ -48,30 +46,28 @@ define i32 @v_mul_add_1_i32_commute(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_1_i32_commute:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul = mul i32 %add, %x
@@ -352,30 +348,26 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_zext:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_zext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_zext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_zext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -450,30 +442,26 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_sext:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_sext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_sext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_sext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -548,30 +536,28 @@ define i25 @v_mul_add_1_i25_zext(i25 zeroext %x, i25 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i25_zext:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i25_zext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i25_zext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i25_zext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i25 %y, 1
   %mul = mul i25 %x, %add
@@ -616,30 +602,28 @@ define i25 @v_mul_add_1_i25_sext(i25 signext %x, i25 signext %y) {
 ; GFX67-LABEL: v_mul_add_1_i25_sext:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i25_sext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i25_sext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i25_sext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i25 %y, 1
   %mul = mul i25 %x, %add
@@ -693,23 +677,20 @@ define i16 @v_mul_add_1_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: v_mul_add_1_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
   %mul = mul i16 %x, %add
@@ -730,23 +711,20 @@ define i32 @v_mul_add_1_i16_zext_result(i16 %x, i16 %y) {
 ; GFX8-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
@@ -768,23 +746,20 @@ define i16 @v_mul_add_1_i16_commute(i16 %x, i16 %y) {
 ; GFX8-LABEL: v_mul_add_1_i16_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
   %mul = mul i16 %add, %x
@@ -1007,63 +982,57 @@ define i64 @v_mul_add_1_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_1_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_1_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX7-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX7-NEXT:    v_mul_lo_u32 v3, v0, v1
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
-; GFX7-NEXT:    v_mul_lo_u32 v2, v4, v2
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v1
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
-; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
-; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v4, v1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, v0, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, 0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %y, 1
   %mul = mul i64 %x, %add
@@ -1074,63 +1043,57 @@ define i64 @v_mul_add_1_i64_commute(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_1_i64_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_1_i64_commute:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
-; GFX7-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX7-NEXT:    v_mul_lo_u32 v3, v0, v1
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, 0
-; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i64_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v2
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v1
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, 0
-; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i64_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
-; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i64_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v1
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v0, 0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %y, 1
   %mul = mul i64 %add, %x
@@ -1535,34 +1498,45 @@ define <2 x i32> @v_mul_add_1_i32_multiple(i32 %x, i32 %y, i32 %z) {
 ; GFX67-LABEL: v_mul_add_1_i32_multiple:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    v_mul_lo_u32 v3, v0, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_multiple:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_mul_add_1_i32_multiple:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mul_add_1_i32_multiple:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v3, v1
+; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v3, v[0:1]
+; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, v3, v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_mul_add_1_i32_multiple:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v3, v[0:1]
+; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v3, v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_multiple:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v3, v[0:1]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, v3, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul0 = mul i32 %x, %add
@@ -1617,8 +1591,8 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, v1, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_chain:
@@ -1628,8 +1602,8 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32_chain:
@@ -1639,8 +1613,7 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_chain:
@@ -1650,9 +1623,8 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %i2 = add i32 %arg0, 1
   %i3 = mul i32 %i2, %arg1
@@ -1680,12 +1652,12 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-LABEL: v_mul_add_1_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, 1
-; GFX8-NEXT:    v_add_u16_e32 v2, 1, v1
-; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i16:
@@ -1725,12 +1697,12 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-LABEL: v_mul_add_1_v2i16_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, 1
-; GFX8-NEXT:    v_add_u16_e32 v2, 1, v1
-; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i16_commute:
@@ -2030,38 +2002,45 @@ define <2 x i32> @v_mul_add_1_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i32:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX67-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_mul_add_1_v2i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mul_add_1_v2i32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v3, v[1:2]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_mul_add_1_v2i32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
+; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, v[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v3, v[1:2]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %x, %add
@@ -2072,38 +2051,45 @@ define <2 x i32> @v_mul_add_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX67-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_mul_add_1_v2i32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v3, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mul_add_1_v2i32_commute:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
+; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v3, v[1:2]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_mul_add_1_v2i32_commute:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
+; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, v[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v3, v[1:2]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %add, %x
@@ -2382,38 +2368,30 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX67-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX10-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %x, %add
@@ -2424,38 +2402,30 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
-; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX67-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
-; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
-; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
+; GFX10-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %add, %x
@@ -3318,36 +3288,36 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX67:       ; %bb.0: ; %bb
 ; GFX67-NEXT:    s_load_dword s3, s[0:1], 0x6
 ; GFX67-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX67-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-NEXT:    s_load_dword s6, s[6:7], 0x1
+; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX67-NEXT:    s_add_i32 s3, s3, 1
 ; GFX67-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX67-NEXT:    s_mul_i32 s2, s2, s6
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, s3, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v2, v2, v0
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4
-; GFX67-NEXT:    s_load_dword s3, s[6:7], 0x1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v2, v1
-; GFX67-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX67-NEXT:    s_mov_b32 s3, 0xf000
+; GFX67-NEXT:    s_mov_b32 s2, 0
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v3
-; GFX67-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v2
-; GFX67-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX67-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v2, v1
-; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v3
-; GFX67-NEXT:    s_mov_b32 s3, 0xf000
-; GFX67-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT:    v_mov_b32_e32 v2, s5
+; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX67-NEXT:    v_add_i32_e32 v3, vcc, v3, v1
+; GFX67-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-NEXT:    v_mul_lo_u32 v2, v2, v1
-; GFX67-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX67-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX67-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX67-NEXT:    s_mov_b32 s2, 0
+; GFX67-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
 ; GFX67-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX67-NEXT:    s_endpgm
 ;
@@ -3355,37 +3325,37 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX8:       ; %bb.0: ; %bb
 ; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x18
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x10
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    s_add_i32 s3, s3, 1
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s3, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x10
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x4
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v2, v1
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, s3, 0xffff
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v3
-; GFX8-NEXT:    s_mul_i32 s2, s2, s3
+; GFX8-NEXT:    s_mul_i32 s2, s2, s1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3397,7 +3367,7 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_add_i32 s3, s3, 1
 ; GFX900-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX900-NEXT:    v_mov_b32_e32 v4, s9
+; GFX900-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX900-NEXT:    v_add_u32_e32 v2, s3, v1
 ; GFX900-NEXT:    v_mul_lo_u32 v2, v2, v0
 ; GFX900-NEXT:    v_add_u32_e32 v1, 1, v1
@@ -3411,20 +3381,18 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX900-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX900-NEXT:    v_add_u32_e32 v2, 1, v3
 ; GFX900-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX900-NEXT:    v_mov_b32_e32 v5, s1
+; GFX900-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX900-NEXT:    v_mul_lo_u32 v3, v1, v2
-; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX900-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX900-NEXT:    v_mul_lo_u32 v2, v2, v1
-; GFX900-NEXT:    v_add_u32_e32 v1, 1, v3
-; GFX900-NEXT:    v_mul_lo_u32 v3, v2, v1
-; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX900-NEXT:    v_add_u32_e32 v2, 1, v2
-; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX900-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v0
-; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX900-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v1, v3, v[1:2]
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[3:4], 2, v[3:4]
+; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, v1, v[2:3]
+; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s8, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
+; GFX900-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: compute_mad:
@@ -3438,23 +3406,21 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX90A-NEXT:    v_add_u32_e32 v2, s3, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX90A-NEXT:    s_load_dword s3, s[6:7], 0x4
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v3
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    s_load_dword s3, s[6:7], 0x4
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GFX90A-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, v3, v[2:3]
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v2, v1
-; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v3
 ; GFX90A-NEXT:    s_mul_i32 s2, s2, s3
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v3
+; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -3485,20 +3451,18 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v3, v2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, s2, s3, v[0:1]
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v2
-; GFX10-NEXT:    v_add_co_u32 v0, s2, s4, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s2
-; GFX10-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX10-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_mul_lo_u32 v1, v3, v2
+; GFX10-NEXT:    v_add_co_u32 v2, s2, s4, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, null, s5, 0, s2
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5]
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX10-NEXT:    s_endpgm
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -3603,23 +3567,20 @@ define i8 @v_mul_add_1_i8(i8 %x, i8 %y) {
 ; GFX8-LABEL: v_mul_add_1_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %x, %add
@@ -3639,23 +3600,20 @@ define i8 @v_mul_add_1_i8_commute(i8 %x, i8 %y) {
 ; GFX8-LABEL: v_mul_add_1_i8_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %add, %x
@@ -3674,23 +3632,20 @@ define i8 @v_mul_add_1_i8_zext(i8 zeroext %x, i8 zeroext %y) {
 ; GFX8-LABEL: v_mul_add_1_i8_zext:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_zext:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_zext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %x, %add
@@ -3709,23 +3664,20 @@ define i8 @v_mul_add_1_i8_zext_commute(i8 zeroext %x, i8 zeroext %y) {
 ; GFX8-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %add, %x
@@ -3752,11 +3704,9 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX8-LABEL: v_mul_add_1_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v3, 1, v3
-; GFX8-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX8-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX8-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX8-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3764,11 +3714,9 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX9-LABEL: v_mul_add_1_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v2, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3777,10 +3725,8 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v3, v3, 1
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX10-NEXT:    v_mad_u16 v1, v1, v3, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -3810,11 +3756,9 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
 ; GFX8-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v3, 1, v3
-; GFX8-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX8-NEXT:    v_mul_lo_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
-; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
+; GFX8-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3822,11 +3766,9 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
 ; GFX9-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v3, 1, v3
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v3, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v2, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3835,10 +3777,8 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u16 v3, v3, 1
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_mul_lo_u16 v1, v3, v1
-; GFX10-NEXT:    v_mul_lo_u16 v0, v2, v0
+; GFX10-NEXT:    v_mad_u16 v1, v1, v3, v1
+; GFX10-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD


        


More information about the llvm-commits mailing list