[llvm] 42e0d30 - [NVPTX] Enhance `mul.wide` and `mad.wide` peepholes (#150477)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 08:57:28 PDT 2025
Author: Justin Fargnoli
Date: 2025-07-30T08:57:19-07:00
New Revision: 42e0d302686657cc381dc49033f68daa0f09b046
URL: https://github.com/llvm/llvm-project/commit/42e0d302686657cc381dc49033f68daa0f09b046
DIFF: https://github.com/llvm/llvm-project/commit/42e0d302686657cc381dc49033f68daa0f09b046.diff
LOG: [NVPTX] Enhance `mul.wide` and `mad.wide` peepholes (#150477)
Implements `(sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)`
as a DAG combine.
Implements `(add (mul.wide a, b), c) -> (mad.wide a, b, c)` in
instruction selection.
Added:
llvm/test/CodeGen/NVPTX/combine-wide.ll
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/test/CodeGen/NVPTX/bug26185-2.ll
llvm/test/CodeGen/NVPTX/local-stack-frame.ll
llvm/test/CodeGen/NVPTX/vector-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 96f522759b0ea..95abcded46485 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
CodeGenOptLevel OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm) {
- doMulWide = (OptLevel > CodeGenOptLevel::None);
-}
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e504a8fe32ecb..9e0f88e544980 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -40,9 +40,6 @@ struct NVPTXScopes {
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
- // If true, generate mul.wide from sext and mul
- bool doMulWide;
-
NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
bool usePrecSqrtF32(const SDNode *N) const;
bool useF32FTZ() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f79b8629f01e2..4fd362303b6e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
- ISD::STORE});
+ ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5219,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N,
return SDValue();
}
+/// Fold an extension of a non-wrapping narrow multiply into a widening
+/// multiply node:
+///   (sign_extend (mul nsw x, y)) -> (NVPTXISD::MUL_WIDE_SIGNED x, y)
+///   (zero_extend (mul nuw x, y)) -> (NVPTXISD::MUL_WIDE_UNSIGNED x, y)
+/// A left shift by a constant C is treated as a multiply by (1 << C).
+///
+/// Only i16 -> i32 and i32 -> i64 extensions of a single-use operand are
+/// handled, and nothing is done when optimizations are disabled.
+///
+/// \returns the new widening-multiply node, or an empty SDValue when the
+/// combine does not apply.
+static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  // The narrow mul/shl is replaced, so it must not have other users.
+  if (!Op.hasOneUse())
+    return SDValue();
+  EVT ToVT = N->getValueType(0);
+  EVT FromVT = Op.getValueType();
+  if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
+        (ToVT == MVT::i64 && FromVT == MVT::i32)))
+    return SDValue();
+  if (!(Op.getOpcode() == ISD::MUL ||
+        (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
+    return SDValue();
+
+  SDLoc DL(N);
+  unsigned ExtOpcode = N->getOpcode();
+  unsigned Opcode = 0;
+  // The no-wrap flag must match the extension kind: only then does extending
+  // the narrow product equal the product of the extended operands.
+  if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
+    Opcode = NVPTXISD::MUL_WIDE_SIGNED;
+  else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
+    Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
+  else
+    return SDValue();
+  SDValue RHS = Op.getOperand(1);
+  if (Op.getOpcode() == ISD::SHL) {
+    const unsigned FromBits = FromVT.getScalarSizeInBits();
+    const uint64_t ShiftAmt = Op.getConstantOperandVal(1);
+    // The multiplier (1 << ShiftAmt) must be representable in the narrow
+    // operand type: a signed operand cannot hold 2^(FromBits - 1), and no
+    // operand can hold a shift of FromBits or more.
+    const unsigned MaxShift =
+        Opcode == NVPTXISD::MUL_WIDE_SIGNED ? FromBits - 2 : FromBits - 1;
+    if (ShiftAmt > MaxShift)
+      return SDValue();
+    // Both multiply operands of the wide node have the narrow type, so the
+    // constant is built in FromVT rather than in the result type.
+    RHS = DCI.DAG.getConstant(APInt::getOneBitSet(FromBits, ShiftAmt), DL,
+                              FromVT);
+  }
+  return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
+}
+
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -5825,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return combineADDRSPACECAST(N, DCI);
case ISD::AND:
return PerformANDCombine(N, DCI);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return combineMulWide(N, DCI, OptLevel);
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 86d6f7c3fc3a3..41bfe7edfa9bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
-def doMulWide : Predicate<"doMulWide">;
-
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
@@ -836,36 +834,28 @@ def MULWIDES64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">;
def MULWIDES64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">;
-def MULWIDES64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">;
def MULWIDEU64 :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">;
def MULWIDEU64Imm :
BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">;
-def MULWIDEU64Imm64 :
- BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">;
def MULWIDES32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">;
def MULWIDES32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">;
-def MULWIDES32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">;
def MULWIDEU32 :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">;
def MULWIDEU32Imm :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">;
-def MULWIDEU32Imm32 :
- BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">;
-def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>;
// Matchers for signed, unsigned mul.wide ISD nodes.
-let Predicates = [doMulWide] in {
+let Predicates = [hasOptEnabled] in {
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
@@ -877,85 +867,6 @@ let Predicates = [doMulWide] in {
def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
}
-// Predicates used for converting some patterns to mul.wide.
-def SInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(16);
-}]>;
-
-def IntConst_0_30 : PatLeaf<(imm), [{
- // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(31);
-}]>;
-
-def IntConst_0_14 : PatLeaf<(imm), [{
- // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(15);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(32, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(16, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-let Predicates = [doMulWide] in {
- def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDES64Imm $a, (SHL2MUL32 $b))>;
- def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
- (MULWIDEU64Imm $a, (SHL2MUL32 $b))>;
-
- def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDES32Imm $a, (SHL2MUL16 $b))>;
- def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
- (MULWIDEU32Imm $a, (SHL2MUL16 $b))>;
-
- // Convert "sign/zero-extend then multiply" to mul.wide.
- def : Pat<(mul (sext i32:$a), (sext i32:$b)),
- (MULWIDES64 $a, $b)>;
- def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>;
-
- def : Pat<(mul (zext i32:$a), (zext i32:$b)),
- (MULWIDEU64 $a, $b)>;
- def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>;
-
- def : Pat<(mul (sext i16:$a), (sext i16:$b)),
- (MULWIDES32 $a, $b)>;
- def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>;
-
- def : Pat<(mul (zext i16:$a), (zext i16:$b)),
- (MULWIDEU32 $a, $b)>;
- def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>;
-}
-
//
// Integer multiply-add
//
@@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>;
defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>;
}
+// Defines the four operand-form variants of a "mad.wide.<PtxSuffix>"
+// instruction. Each variant carries a selection pattern matching
+//   (add (Op a, b), c)
+// where a and b have the narrow type SmallT and c and the result have the
+// wide type BigT.
+//
+// The def-name suffix gives the kind of $a, $b, $c in order: 'r' is a
+// register operand, 'i' is an immediate operand. $a is always a register.
+//
+// Op is a OneUse2 pattern fragment wrapping a mul.wide node.
+// NOTE(review): presumably this restricts the match to a multiply with a
+// single use -- confirm against the OneUse2 definition.
+multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> {
+  // $b and $c both in registers.
+  def rrr:
+    BasicNVPTXInst<(outs BigT.RC:$dst),
+                   (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c),
+                   "mad.wide." # PtxSuffix,
+                   [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>;
+  // $b in a register, $c (the addend) an immediate.
+  def rri:
+    BasicNVPTXInst<(outs BigT.RC:$dst),
+                   (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c),
+                   "mad.wide." # PtxSuffix,
+                   [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>;
+  // $b (the multiplier) an immediate, $c in a register.
+  def rir:
+    BasicNVPTXInst<(outs BigT.RC:$dst),
+                   (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c),
+                   "mad.wide." # PtxSuffix,
+                   [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>;
+  // $b and $c both immediates.
+  def rii:
+    BasicNVPTXInst<(outs BigT.RC:$dst),
+                   (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c),
+                   "mad.wide." # PtxSuffix,
+                   [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>;
+}
+
+def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>;
+def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>;
+
+let Predicates = [hasOptEnabled] in {
+defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>;
+defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>;
+defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>;
+}
+
foreach t = [I16RT, I32RT, I64RT] in {
def NEG_S # t.Size :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index 4e11f58f85ee0..46172b1af1236 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -16,7 +16,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
; CHECK: .maxntid 1, 1, 1
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0];
@@ -25,10 +25,9 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
-; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
-; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
-; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
-; CHECK-NEXT: st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT: ld.global.b64 %rd6, [%rd5];
+; CHECK-NEXT: mad.wide.s32 %rd7, %r1, %r1, %rd6;
+; CHECK-NEXT: st.global.b64 [%rd5], %rd7;
; CHECK-NEXT: ret;
bb:
%tmp5 = add nsw i64 %arg3, 8
diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll
new file mode 100644
index 0000000000000..ed4a2b6e419c3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll
@@ -0,0 +1,1339 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O1 | FileCheck %s --check-prefixes=CHECK,O1
+; RUN: llc < %s -O0 | FileCheck %s --check-prefixes=CHECK,O0
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i64 @t1(i32 %a, i32 %b, i64 %c) {
+;
+; O1-LABEL: t1(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t1_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t1_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t1_param_2];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t1(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t1_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t1_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t1_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 %c, %sext
+ ret i64 %add
+}
+
+define i64 @t2(i32 %a, i32 %b, i64 %c) {
+;
+; O1-LABEL: t2(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t2_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t2_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t2_param_2];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t2(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t2_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t2_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t2_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd2, %rd1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 %sext, %c
+ ret i64 %add
+}
+
+define i64 @t3(i32 %a, i32 %b) {
+;
+; O1-LABEL: t3(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t3_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t3_param_1];
+; O1-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t3(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t3_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t3_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: add.s64 %rd2, %rd1, 1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd2;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 1, %sext
+ ret i64 %add
+}
+
+define i64 @t4(i32 %a, i64 %c) {
+;
+; O1-LABEL: t4(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t4_param_0];
+; O1-NEXT: ld.param.b64 %rd1, [t4_param_1];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t4(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t4_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t4_param_0];
+; O0-NEXT: mul.lo.s32 %r2, %r1, 3;
+; O0-NEXT: cvt.s64.s32 %rd2, %r2;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, 3
+ %sext = sext i32 %mul to i64
+ %add = add i64 %c, %sext
+ ret i64 %add
+}
+
+define i64 @t4_1(i32 %a, i64 %c) {
+;
+; O1-LABEL: t4_1(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0];
+; O1-NEXT: mad.wide.s32 %rd1, %r1, 3, 5;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t4_1(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t4_1_param_0];
+; O0-NEXT: mul.lo.s32 %r2, %r1, 3;
+; O0-NEXT: cvt.s64.s32 %rd1, %r2;
+; O0-NEXT: add.s64 %rd2, %rd1, 5;
+; O0-NEXT: st.param.b64 [func_retval0], %rd2;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, 3
+ %sext = sext i32 %mul to i64
+ %add = add i64 5, %sext
+ ret i64 %add
+}
+
+define i64 @t5(i32 %a, i32 %b, i64 %c) {
+;
+; O1-LABEL: t5(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t5_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t5_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t5_param_2];
+; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t5(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t5_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t5_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t5_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ %add = add i64 %c, %zext
+ ret i64 %add
+}
+
+define i64 @t6(i32 %a, i32 %b, i64 %c) {
+;
+; O1-LABEL: t6(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t6_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t6_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t6_param_2];
+; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t6(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t6_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t6_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t6_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd2, %rd1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ %add = add i64 %zext, %c
+ ret i64 %add
+}
+
+define i32 @t7(i16 %a, i16 %b) {
+;
+; O1-LABEL: t7(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t7_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t7_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t7(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t7_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t7_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t8(i16 %a, i16 %b) {
+;
+; O1-LABEL: t8(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t8_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t8_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s32.s16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t8_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t8_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t9(i32 %a, i32 %b) {
+;
+; O1-LABEL: t9(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t9_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t9_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.u64.u32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t9(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t9_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t9_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t10(i32 %a, i32 %b) {
+;
+; O1-LABEL: t10(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t10_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t10_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t10(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t10_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t10_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t11(i16 %a, i16 %b) {
+;
+; O1-LABEL: t11(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t11_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t11_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t11(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t11_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t11_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t12(i16 %a, i16 %b) {
+;
+; O1-LABEL: t12(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t12_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t12_param_1];
+; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t12(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t12_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t12_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t13(i32 %a, i32 %b) {
+;
+; O1-LABEL: t13(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t13_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t13_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.u64.u32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t13(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t13_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t13_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t14(i32 %a, i32 %b) {
+;
+; O1-LABEL: t14(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t14_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t14_param_1];
+; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t14(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t14_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t14_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t15(i16 %a, i16 %b) {
+;
+; O1-LABEL: t15(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t15_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t15_param_1];
+; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t15(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t15_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t15_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t16(i16 %a, i16 %b) {
+;
+; O1-LABEL: t16(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t16_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t16_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s32.s16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t16(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t16_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t16_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t17(i32 %a, i32 %b) {
+;
+; O1-LABEL: t17(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t17_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t17_param_1];
+; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t17(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t17_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t17_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t18(i32 %a, i32 %b) {
+;
+; O1-LABEL: t18(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t18_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t18_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t18(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t18_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t18_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t19(i16 %a, i16 %b) {
+;
+; O1-LABEL: t19(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t19_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t19_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t19(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t19_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t19_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t20(i16 %a) {
+;
+; CHECK-LABEL: t20(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t20_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.s32.s16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t21(i32 %a) {
+;
+; CHECK-LABEL: t21(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t21_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t22(i32 %a) {
+;
+; CHECK-LABEL: t22(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t22_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.s64.s32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t23(i16 %a, i16 %b) {
+;
+; CHECK-LABEL: t23(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t23_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl nsw i16 %a, 4
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t24(i16 %a, i16 %b) {
+; sext of an i16 shl nsw by 4: at O1 the checks expect mul.wide.s16 by 16; at O0 shl.b16 + cvt.s32.s16 remains. %b is unused.
+; O1-LABEL: t24(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t24_param_0];
+; O1-NEXT: mul.wide.s16 %r1, %rs1, 16;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t24(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t24_param_0];
+; O0-NEXT: shl.b16 %rs2, %rs1, 4;
+; O0-NEXT: cvt.s32.s16 %r1, %rs2;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = shl nsw i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t25(i32 %a) {
+; Negative case: zext of an i32 shl nsw (nsw paired with zext); expects shl.b32 + cvt.u64.u32, no mul.wide.
+; CHECK-LABEL: t25(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t25_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl nsw i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t26(i32 %a) {
+; sext of an i32 shl nsw by 4: at O1 the checks expect mul.wide.s32 by 16; at O0 shl.b32 + cvt.s64.s32 remains.
+; O1-LABEL: t26(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t26_param_0];
+; O1-NEXT: mul.wide.s32 %rd1, %r1, 16;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t26(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t26_param_0];
+; O0-NEXT: shl.b32 %r2, %r1, 4;
+; O0-NEXT: cvt.s64.s32 %rd1, %r2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nsw i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t27(i16 %a, i16 %b) {
+; zext of an i16 shl nuw by 4: at O1 the checks expect mul.wide.u16 by 16; at O0 shl.b16 + cvt.u32.u16 remains. %b is unused.
+; O1-LABEL: t27(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t27_param_0];
+; O1-NEXT: mul.wide.u16 %r1, %rs1, 16;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t27(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t27_param_0];
+; O0-NEXT: shl.b16 %rs2, %rs1, 4;
+; O0-NEXT: cvt.u32.u16 %r1, %rs2;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = shl nuw i16 %a, 4
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t28(i16 %a, i16 %b) {
+; Negative case: sext of an i16 shl nuw (nuw paired with sext); expects shl.b16 + cvt.s32.s16, no mul.wide. %b is unused.
+; CHECK-LABEL: t28(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t28_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.s32.s16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl nuw i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t29(i32 %a) {
+; zext of an i32 shl nuw by 4: at O1 the checks expect mul.wide.u32 by 16; at O0 shl.b32 + cvt.u64.u32 remains.
+; O1-LABEL: t29(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t29_param_0];
+; O1-NEXT: mul.wide.u32 %rd1, %r1, 16;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t29(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t29_param_0];
+; O0-NEXT: shl.b32 %r2, %r1, 4;
+; O0-NEXT: cvt.u64.u32 %rd1, %r2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nuw i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t30(i32 %a) {
+; Negative case: sext of an i32 shl nuw (nuw paired with sext); expects shl.b32 + cvt.s64.s32, no mul.wide.
+; CHECK-LABEL: t30(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t30_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.s64.s32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl nuw i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i64 @t31(i32 %a, i32 %b) {
+; Negative case: sext of a shl nuw by the variable amount %b; both runs expect shl.b32 + cvt.s64.s32 (only the param load order differs).
+; O1-LABEL: t31(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t31_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t31_param_1];
+; O1-NEXT: shl.b32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t31(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t31_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t31_param_0];
+; O0-NEXT: shl.b32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nuw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t32(i16 %a, i16 %b, i32 %c) {
+; %c + sext(mul nsw i16 %a, %b): at O1 the checks expect a single mad.wide.s16; at O0 mul.lo.s16 + cvt.s32.s16 + add.s32.
+; O1-LABEL: t32(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t32_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t32_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t32_param_2];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t32(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t32_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t32_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t32_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 %c, %sext
+ ret i32 %add
+}
+
+define i32 @t33(i16 %a, i16 %b, i32 %c) {
+; Same %c + sext(mul nsw) pattern, body identical to @t32. NOTE(review): possibly meant to commute the add operands - confirm.
+; O1-LABEL: t33(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t33_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t33_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t33_param_2];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t33(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t33_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t33_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t33_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 %c, %sext
+ ret i32 %add
+}
+
+define i32 @t34(i16 %a, i16 %b) {
+; 1 + sext(mul nsw i16 %a, %b): at O1 the checks expect mad.wide.s16 with the immediate addend 1; at O0 mul.lo + cvt + add.
+; O1-LABEL: t34(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t34_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t34_param_1];
+; O1-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t34(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t34_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t34_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: add.s32 %r2, %r1, 1;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 1, %sext
+ ret i32 %add
+}
+
+define i32 @t35(i16 %a, i32 %c) {
+; %c + sext(mul nsw i16 %a, 3): at O1 the checks expect mad.wide.s16 with the immediate multiplier 3; at O0 mul.lo + cvt + add.
+; O1-LABEL: t35(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t35_param_0];
+; O1-NEXT: ld.param.b32 %r1, [t35_param_1];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t35(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t35_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t35_param_0];
+; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3;
+; O0-NEXT: cvt.s32.s16 %r2, %rs2;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, 3
+ %sext = sext i16 %mul to i32
+ %add = add i32 %c, %sext
+ ret i32 %add
+}
+
+define i32 @t36(i16 %a, i32 %c) {
+; 5 + sext(mul nsw i16 %a, 3): at O1 the checks expect mad.wide.s16 with both the multiplier and addend immediate. %c is unused.
+; O1-LABEL: t36(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t36_param_0];
+; O1-NEXT: mad.wide.s16 %r1, %rs1, 3, 5;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t36(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t36_param_0];
+; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3;
+; O0-NEXT: cvt.s32.s16 %r1, %rs2;
+; O0-NEXT: add.s32 %r2, %r1, 5;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, 3
+ %sext = sext i16 %mul to i32
+ %add = add i32 5, %sext
+ ret i32 %add
+}
+
+define i32 @t37(i16 %a, i16 %b, i32 %c) {
+; %c + zext(mul nuw i16 %a, %b): at O1 the checks expect a single mad.wide.u16; at O0 mul.lo.s16 + cvt.u32.u16 + add.s32.
+; O1-LABEL: t37(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t37_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t37_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t37_param_2];
+; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t37(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t37_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t37_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t37_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ %add = add i32 %c, %zext
+ ret i32 %add
+}
+
+define i32 @t38(i16 %a, i16 %b, i32 %c) {
+; zext(mul nuw i16 %a, %b) + %c, i.e. @t37 with the add operands commuted: at O1 the checks still expect mad.wide.u16.
+; O1-LABEL: t38(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t38_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t38_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t38_param_2];
+; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t38(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t38_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t38_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t38_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r2, %r1;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ %add = add i32 %zext, %c
+ ret i32 %add
+}
+
+define i64 @t39(i16 %a, i16 %b) {
+; Negative case: zext of a flag-less i16 mul straight to i64; both runs expect mul.lo.s16 + cvt.u64.u16 and no mul.wide.
+; O1-LABEL: t39(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t39_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t39_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t39(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t39_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t39_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t40(i16 %a, i16 %b) {
+; Negative case: zext of an i16 mul nuw to i64; no mul.wide is expected even with nuw (presumably because i64 is not twice i16 - confirm).
+; O1-LABEL: t40(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t40_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t40_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t40(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t40_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t40_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t41(i16 %a, i16 %b) {
+; Negative case: sext of an i16 mul nsw to i64; no mul.wide is expected even with nsw (presumably because i64 is not twice i16 - confirm).
+; O1-LABEL: t41(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t41_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t41_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s64.s16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t41(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t41_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t41_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s64.s16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t42(i16 %a, i16 %b, ptr %ptr) {
+; Negative case: the narrow mul nuw result is also stored to %ptr, so it has a second use; both runs keep mul.lo.s16 + cvt.u32.u16.
+; O1-LABEL: t42(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t42_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t42_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: ld.param.b64 %rd1, [t42_param_2];
+; O1-NEXT: st.b16 [%rd1], %rs3;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t42(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t42_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t42_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t42_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: st.b16 [%rd1], %rs3;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ store i16 %mul, ptr %ptr
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t43(i16 %a, i16 %b, i32 %c, ptr %ptr) {
+; Negative case: as @t42 (narrow product also stored) plus an add of %c; both runs keep mul.lo.s16 + cvt + add.s32, no mad.wide.
+; O1-LABEL: t43(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t43_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t43_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: ld.param.b64 %rd1, [t43_param_3];
+; O1-NEXT: st.b16 [%rd1], %rs3;
+; O1-NEXT: ld.param.b32 %r1, [t43_param_2];
+; O1-NEXT: cvt.u32.u16 %r2, %rs3;
+; O1-NEXT: add.s32 %r3, %r2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r3;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t43(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t43_param_3];
+; O0-NEXT: ld.param.b32 %r1, [t43_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t43_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t43_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: st.b16 [%rd1], %rs3;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r2, %r1;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ store i16 %mul, ptr %ptr
+ %zext = zext i16 %mul to i32
+ %add = add i32 %zext, %c
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 5c3017310d0a3..ae069cf956c36 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -114,15 +114,14 @@ define void @foo3(i32 %a) {
; PTX64-NEXT: .reg .b64 %SP;
; PTX64-NEXT: .reg .b64 %SPL;
; PTX64-NEXT: .reg .b32 %r<2>;
-; PTX64-NEXT: .reg .b64 %rd<5>;
+; PTX64-NEXT: .reg .b64 %rd<4>;
; PTX64-EMPTY:
; PTX64-NEXT: // %bb.0:
; PTX64-NEXT: mov.b64 %SPL, __local_depot2;
; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0];
; PTX64-NEXT: add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT: mul.wide.s32 %rd3, %r1, 4;
-; PTX64-NEXT: add.s64 %rd4, %rd2, %rd3;
-; PTX64-NEXT: st.local.b32 [%rd4], %r1;
+; PTX64-NEXT: mad.wide.s32 %rd3, %r1, 4, %rd2;
+; PTX64-NEXT: st.local.b32 [%rd3], %r1;
; PTX64-NEXT: ret;
%local = alloca [3 x i32], align 4
%1 = getelementptr inbounds i32, ptr %local, i32 %a
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index e16fc74325416..6f0dff78d5569 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -154,7 +154,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-NEXT: .reg .b32 %r<8>;
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [foo_complex_param_0];
@@ -166,12 +166,11 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
; CHECK-NEXT: shl.b32 %r6, %r1, 1;
; CHECK-NEXT: or.b32 %r7, %r5, %r6;
; CHECK-NEXT: cvt.u64.u32 %rd2, %r7;
-; CHECK-NEXT: mul.wide.u32 %rd3, %r3, 131072;
-; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: add.s64 %rd5, %rd4, %rd2;
-; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd5+128];
+; CHECK-NEXT: mad.wide.u32 %rd3, %r3, 131072, %rd1;
+; CHECK-NEXT: add.s64 %rd4, %rd3, %rd2;
+; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd4+128];
; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.b8 [%rd5+129], %rs3;
+; CHECK-NEXT: st.b8 [%rd4+129], %rs3;
; CHECK-NEXT: ret;
%t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
%t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
More information about the llvm-commits
mailing list