[llvm] [NVPTX] Fix v2i8 call lowering, use generic ld/st nodes for call params (PR #146930)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 10:28:47 PDT 2025
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/146930
(No pull request description was provided.)
>From 80de180223ceb6fec1fcfec9ede7695c098611fd Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 3 Jul 2025 02:44:43 +0000
Subject: [PATCH] [NVPTX] Fixup v2i8 call lowering, use generic load/store
nodes for call params
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 273 --------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 -
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 591 +++++++----------
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 133 +---
.../test/CodeGen/NVPTX/bf16x2-instructions.ll | 6 +-
llvm/test/CodeGen/NVPTX/byval-const-global.ll | 8 +-
.../CodeGen/NVPTX/call-with-alloca-buffer.ll | 10 +-
llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll | 16 +-
llvm/test/CodeGen/NVPTX/combine-mad.ll | 4 +-
llvm/test/CodeGen/NVPTX/compare-int.ll | 621 ++++++++++++++----
.../CodeGen/NVPTX/convert-call-to-indirect.ll | 172 ++++-
llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 4 +-
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 12 +-
llvm/test/CodeGen/NVPTX/fma.ll | 8 +-
llvm/test/CodeGen/NVPTX/forward-ld-param.ll | 2 +-
llvm/test/CodeGen/NVPTX/i128-param.ll | 20 +-
llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 12 +-
llvm/test/CodeGen/NVPTX/i8x2-instructions.ll | 121 +++-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 12 +-
llvm/test/CodeGen/NVPTX/idioms.ll | 2 +-
llvm/test/CodeGen/NVPTX/indirect_byval.ll | 20 +-
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 33 +-
llvm/test/CodeGen/NVPTX/lower-args.ll | 10 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 2 +-
llvm/test/CodeGen/NVPTX/misched_func_call.ll | 15 +-
llvm/test/CodeGen/NVPTX/param-add.ll | 8 +-
llvm/test/CodeGen/NVPTX/param-load-store.ll | 258 ++++----
llvm/test/CodeGen/NVPTX/param-overalign.ll | 4 +-
.../CodeGen/NVPTX/param-vectorize-device.ll | 28 +-
llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 4 +-
llvm/test/CodeGen/NVPTX/st-param-imm.ll | 221 ++++---
llvm/test/CodeGen/NVPTX/store-undef.ll | 4 +-
llvm/test/CodeGen/NVPTX/tex-read-cuda.ll | 2 +-
.../NVPTX/unaligned-param-load-store.ll | 594 +++++++----------
llvm/test/CodeGen/NVPTX/vaargs.ll | 16 +-
llvm/test/CodeGen/NVPTX/variadics-backend.ll | 32 +-
37 files changed, 1642 insertions(+), 1648 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 5631342ecc13e..fdd2671a5289e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -145,18 +145,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStoreVector(N))
return;
break;
- case NVPTXISD::LoadParam:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadParamV4:
- if (tryLoadParam(N))
- return;
- break;
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- if (tryStoreParam(N))
- return;
- break;
case ISD::INTRINSIC_W_CHAIN:
if (tryIntrinsicChain(N))
return;
@@ -1429,267 +1417,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return true;
}
-bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
- SDValue Chain = Node->getOperand(0);
- SDValue Offset = Node->getOperand(2);
- SDValue Glue = Node->getOperand(3);
- SDLoc DL(Node);
- MemSDNode *Mem = cast<MemSDNode>(Node);
-
- unsigned VecSize;
- switch (Node->getOpcode()) {
- default:
- return false;
- case NVPTXISD::LoadParam:
- VecSize = 1;
- break;
- case NVPTXISD::LoadParamV2:
- VecSize = 2;
- break;
- case NVPTXISD::LoadParamV4:
- VecSize = 4;
- break;
- }
-
- EVT EltVT = Node->getValueType(0);
- EVT MemVT = Mem->getMemoryVT();
-
- std::optional<unsigned> Opcode;
-
- switch (VecSize) {
- default:
- return false;
- case 1:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
- NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64);
- break;
- case 2:
- Opcode =
- pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
- NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
- NVPTX::LoadParamMemV2I64);
- break;
- case 4:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
- NVPTX::LoadParamMemV4I32, {/* no v4i64 */});
- break;
- }
- if (!Opcode)
- return false;
-
- SDVTList VTs;
- if (VecSize == 1) {
- VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
- } else if (VecSize == 2) {
- VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
- } else {
- EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
- VTs = CurDAG->getVTList(EVTs);
- }
-
- unsigned OffsetVal = Offset->getAsZExtVal();
-
- SmallVector<SDValue, 2> Ops(
- {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
- return true;
-}
-
-// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
-#define getOpcV2H(ty, opKind0, opKind1) \
- NVPTX::StoreParamV2##ty##_##opKind0##opKind1
-
-#define getOpcV2H1(ty, opKind0, isImm1) \
- (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
-
-#define getOpcodeForVectorStParamV2(ty, isimm) \
- (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
-
-#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
- NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
-
-#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
- (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
- : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
-
-#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
- (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
- : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
-
-#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
- (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
- : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
-
-#define getOpcodeForVectorStParamV4(ty, isimm) \
- (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
- : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
-
-#define getOpcodeForVectorStParam(n, ty, isimm) \
- (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
- : getOpcodeForVectorStParamV4(ty, isimm)
-
-static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
- unsigned NumElts,
- MVT::SimpleValueType MemTy,
- SelectionDAG *CurDAG, SDLoc DL) {
- // Determine which inputs are registers and immediates make new operators
- // with constant values
- SmallVector<bool, 4> IsImm(NumElts, false);
- for (unsigned i = 0; i < NumElts; i++) {
- IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
- if (IsImm[i]) {
- SDValue Imm = Ops[i];
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[i] = Imm;
- }
- }
-
- // Get opcode for MemTy, size, and register/immediate operand ordering
- switch (MemTy) {
- case MVT::i8:
- return getOpcodeForVectorStParam(NumElts, I8, IsImm);
- case MVT::i16:
- return getOpcodeForVectorStParam(NumElts, I16, IsImm);
- case MVT::i32:
- return getOpcodeForVectorStParam(NumElts, I32, IsImm);
- case MVT::i64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(I64, IsImm);
- case MVT::f32:
- return getOpcodeForVectorStParam(NumElts, F32, IsImm);
- case MVT::f64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(F64, IsImm);
-
- // These cases don't support immediates, just use the all register version
- // and generate moves.
- case MVT::i1:
- return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
- : NVPTX::StoreParamV4I8_rrrr;
- case MVT::f16:
- case MVT::bf16:
- return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
- : NVPTX::StoreParamV4I16_rrrr;
- case MVT::v2f16:
- case MVT::v2bf16:
- case MVT::v2i16:
- case MVT::v4i8:
- return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
- : NVPTX::StoreParamV4I32_rrrr;
- default:
- llvm_unreachable("Cannot select st.param for unknown MemTy");
- }
-}
-
-bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Param = N->getOperand(1);
- unsigned ParamVal = Param->getAsZExtVal();
- SDValue Offset = N->getOperand(2);
- unsigned OffsetVal = Offset->getAsZExtVal();
- MemSDNode *Mem = cast<MemSDNode>(N);
- SDValue Glue = N->getOperand(N->getNumOperands() - 1);
-
- // How many elements do we have?
- unsigned NumElts;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case NVPTXISD::StoreParam:
- NumElts = 1;
- break;
- case NVPTXISD::StoreParamV2:
- NumElts = 2;
- break;
- case NVPTXISD::StoreParamV4:
- NumElts = 4;
- break;
- }
-
- // Build vector of operands
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(N->getOperand(i + 3));
- Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
- CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- // Determine target opcode
- // If we have an i1, use an 8-bit store. The lowering code in
- // NVPTXISelLowering will have already emitted an upcast.
- std::optional<unsigned> Opcode;
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected NumElts");
- case 1: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- SDValue Imm = Ops[0];
- if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
- (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
- // Convert immediate to target constant
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[0] = Imm;
- // Use immediate version of store param
- Opcode =
- pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i,
- NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i);
- } else
- Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
- NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
- NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r);
- if (Opcode == NVPTX::StoreParamI8_r) {
- // Fine tune the opcode depending on the size of the operand.
- // This helps to avoid creating redundant COPY instructions in
- // InstrEmitter::AddRegisterOperand().
- switch (Ops[0].getSimpleValueType().SimpleTy) {
- default:
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI8TruncI32_r;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI8TruncI64_r;
- break;
- }
- }
- break;
- }
- case 2:
- case 4: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
- break;
- }
- }
-
- SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
-
- ReplaceNode(N, Ret);
- return true;
-}
-
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 0e4dec1adca67..19b569d638b0c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
- bool tryLoadParam(SDNode *N);
- bool tryStoreParam(SDNode *N);
bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryBFE(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb0aeb493ed48..3cda7df55ad58 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1049,12 +1049,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
MAKE_CASE(NVPTXISD::CALL)
- MAKE_CASE(NVPTXISD::LoadParam)
- MAKE_CASE(NVPTXISD::LoadParamV2)
- MAKE_CASE(NVPTXISD::LoadParamV4)
- MAKE_CASE(NVPTXISD::StoreParam)
- MAKE_CASE(NVPTXISD::StoreParamV2)
- MAKE_CASE(NVPTXISD::StoreParamV4)
MAKE_CASE(NVPTXISD::MoveParam)
MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
MAKE_CASE(NVPTXISD::BUILD_VECTOR)
@@ -1293,105 +1287,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
-static bool adjustElementType(EVT &ElementType) {
- switch (ElementType.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::f16:
- case MVT::bf16:
- ElementType = MVT::i16;
- return true;
- case MVT::f32:
- case MVT::v2f16:
- case MVT::v2bf16:
- ElementType = MVT::i32;
- return true;
- case MVT::f64:
- ElementType = MVT::i64;
- return true;
- }
-}
-
-// Use byte-store when the param address of the argument value is unaligned.
-// This may happen when the return value is a field of a packed structure.
-//
-// This is called in LowerCall() when passing the param values.
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
- uint64_t Offset, EVT ElementType,
- SDValue StVal, SDValue &InGlue,
- unsigned ArgID, const SDLoc &dl) {
- // Bit logic only works on integer types
- if (adjustElementType(ElementType))
- StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
-
- // Store each byte
- SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- // Shift the byte to the last byte position
- SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
- DAG.getConstant(i * 8, dl, MVT::i32));
- SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- ShiftVal, InGlue};
- // Trunc store only the last byte by using
- // st.param.b8
- // The register type can be larger than b8.
- Chain = DAG.getMemIntrinsicNode(
- NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
- MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
- }
- return Chain;
-}
-
-// Use byte-load when the param adress of the returned value is unaligned.
-// This may happen when the returned value is a field of a packed structure.
-static SDValue
-LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
- EVT ElementType, SDValue &InGlue,
- SmallVectorImpl<SDValue> &TempProxyRegOps,
- const SDLoc &dl) {
- // Bit logic only works on integer types
- EVT MergedType = ElementType;
- adjustElementType(MergedType);
-
- // Load each byte and construct the whole value. Initial value to 0
- SDValue RetVal = DAG.getConstant(0, dl, MergedType);
- // LoadParamMemI8 loads into i16 register only
- SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- InGlue};
- // This will be selected to LoadParamMemI8
- SDValue LdVal =
- DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
- MVT::i8, MachinePointerInfo(), Align(1));
- SDValue TmpLdVal = LdVal.getValue(0);
- Chain = LdVal.getValue(1);
- InGlue = LdVal.getValue(2);
-
- TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
- TmpLdVal.getSimpleValueType(), TmpLdVal);
- TempProxyRegOps.push_back(TmpLdVal);
-
- SDValue CMask = DAG.getConstant(255, dl, MergedType);
- SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
- // Need to extend the i16 register to the whole width.
- TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
- // Mask off the high bits. Leave only the lower 8bits.
- // Do this because we are using loadparam.b8.
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
- // Shift and merge
- TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
- RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
- }
- if (ElementType != MergedType)
- RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
-
- return RetVal;
-}
-
static bool shouldConvertToIndirectCall(const CallBase *CB,
const GlobalAddressSDNode *Func) {
if (!Func)
@@ -1458,10 +1353,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SelectionDAG &DAG = CLI.DAG;
SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
+ const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
Type *RetTy = CLI.RetTy;
const CallBase *CB = CLI.CB;
@@ -1492,9 +1385,34 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned VAOffset = 0; // current offset in the param array
const unsigned UniqueCallSite = GlobalUniqueCallSite++;
- SDValue TempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
- SDValue InGlue = Chain.getValue(1);
+ const SDValue CallChain = CLI.Chain;
+ const SDValue StartChain =
+ DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
+ SDValue DeclareGlue = StartChain.getValue(1);
+
+ SmallVector<SDValue, 16> CallPrereqs{StartChain};
+
+ const auto DeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
+ // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
+ // loaded/stored using i16, so it's handled here as well.
+ const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
+ SDValue Declare =
+ DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
+ const auto DeclareArrayParam = [&](SDValue Symbol, Align Align,
+ unsigned Size) {
+ SDValue Declare = DAG.getNode(
+ NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
// Args.size() and Outs.size() need not match.
// Outs.size() will be larger
@@ -1555,43 +1473,23 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
- const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
+ const SDValue ArgDeclare = [&]() {
if (IsVAArg) {
- if (ArgI == FirstVAArg) {
- VADeclareParam = DAG.getNode(
- NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
- GetI32(0), InGlue});
- return VADeclareParam;
- }
- return std::nullopt;
- }
- if (IsByVal || shouldPassAsArray(Arg.Ty)) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(ArgAlign.value()),
- GetI32(TypeSize), InGlue});
+ if (ArgI == FirstVAArg)
+ VADeclareParam = DeclareArrayParam(
+ ParamSymbol, Align(STI.getMaxRequiredAlignment()), 0);
+ return VADeclareParam;
}
+
+ if (IsByVal || shouldPassAsArray(Arg.Ty))
+ return DeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
- // declare .param .b<size> .param<n>;
-
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- const unsigned PromotedSize =
- (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint())
- ? promoteScalarArgumentSize(TypeSize * 8)
- : TypeSize * 8;
-
- return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
+ assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
+ "Only int and float types are supported as non-array arguments");
+
+ return DeclareScalarParam(ParamSymbol, TypeSize);
}();
- if (ArgDeclare) {
- Chain = ArgDeclare->getValue(0);
- InGlue = ArgDeclare->getValue(1);
- }
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
// than 32-bits are sign extended or zero extended, depending on
@@ -1601,36 +1499,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
- const Align PartAlign) {
- SDValue StVal;
+ const MaybeAlign PartAlign) {
if (IsByVal) {
SDValue Ptr = ArgOutVals[0];
auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
SDValue SrcAddr =
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
- StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign);
- } else {
- StVal = ArgOutVals[I];
-
- auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
- if (PromotedVT != StVal.getValueType()) {
- StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
- StVal);
- }
+ return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
}
+ SDValue StVal = ArgOutVals[I];
+ assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+ StVal.getValueType() &&
+ "OutVal type should always be legal");
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal =
- DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- return StVal;
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT StoreVT =
+ ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+ return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
};
const auto VectorInfo =
@@ -1639,23 +1526,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned J = 0;
for (const unsigned NumElts : VectorInfo) {
const int CurOffset = Offsets[J];
- EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
- const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
-
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar store. In such cases, fall back to byte stores.
- if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
-
- SDValue StVal = GetStoredValue(J, EltVT, PartAlign);
- Chain = LowerUnalignedStoreParam(DAG, Chain,
- CurOffset + (IsByVal ? VAOffset : 0),
- EltVT, StVal, InGlue, ArgI, dl);
-
- // LowerUnalignedStoreParam took care of inserting the necessary nodes
- // into the SDAG, so just move on to the next element.
- J++;
- continue;
- }
+ const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
if (IsVAArg && !IsByVal)
// Align each part of the variadic argument to their type.
@@ -1663,44 +1534,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((IsVAArg || VAOffset == 0) &&
"VAOffset must be 0 for non-VA args");
- SmallVector<SDValue, 6> StoreOperands{
- Chain, GetI32(IsVAArg ? FirstVAArg : ArgI),
- GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))};
- // Record the values to store.
- for (const unsigned K : llvm::seq(NumElts))
- StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign));
- StoreOperands.push_back(InGlue);
+ const unsigned Offset =
+ (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
+ const MaybeAlign CurrentAlign = ExtendIntegerParam
+ ? MaybeAlign(std::nullopt)
+ : commonAlignment(ArgAlign, Offset);
+
+ SDValue Val;
+ if (NumElts == 1) {
+ Val = GetStoredValue(J, EltVT, CurrentAlign);
+ } else {
+ SmallVector<SDValue, 6> StoreVals;
+ for (const unsigned K : llvm::seq(NumElts)) {
+ SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
+ if (ValJ.getValueType().isVector())
+ DAG.ExtractVectorElements(ValJ, StoreVals);
+ else
+ StoreVals.push_back(ValJ);
+ }
+
+ EVT VT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
+ Val = DAG.getBuildVector(VT, dl, StoreVals);
}
- // Adjust type of the store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), PartAlign,
- MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, Val, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+ CallPrereqs.push_back(StoreParam);
// TODO: We may need to support vector types that can be passed
// as scalars in variadic arguments.
if (IsVAArg && !IsByVal) {
assert(NumElts == 1 &&
"Vectorization is expected to be disabled for variadics.");
+ const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
VAOffset +=
DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
}
@@ -1715,23 +1587,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle Result
if (!Ins.empty()) {
- const SDValue RetDeclare = [&]() {
- const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
- const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
- if (shouldPassAsArray(RetTy)) {
- const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(RetAlign.value()),
- GetI32(ResultSize / 8), InGlue});
- }
- const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
- return DAG.getNode(
- NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
- }();
- Chain = RetDeclare.getValue(0);
- InGlue = RetDeclare.getValue(1);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
+ if (shouldPassAsArray(RetTy)) {
+ const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+ DeclareArrayParam(RetSymbol, RetAlign, ResultSize);
+ } else {
+ DeclareScalarParam(RetSymbol, ResultSize);
+ }
}
const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
@@ -1780,10 +1643,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
UniqueCallSite);
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
- Chain = DAG.getNode(
- NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue},
- {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue});
- InGlue = Chain.getValue(1);
+ const SDValue PrototypeDeclare = DAG.getNode(
+ NVPTXISD::CallPrototype, dl, MVT::Other,
+ {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
+ CallPrereqs.push_back(PrototypeDeclare);
}
if (ConvertToIndirectCall) {
@@ -1801,24 +1664,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const unsigned NumArgs =
std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
- Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue},
- {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
- GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee,
- GetI32(Proto), InGlue});
- InGlue = Chain.getValue(1);
-
+ /// NumParams, Callee, Proto)
+ const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
+ const SDValue Call = DAG.getNode(
+ NVPTXISD::CALL, dl, MVT::Other,
+ {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
+ GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
+
+ SmallVector<SDValue, 16> LoadChains{Call};
SmallVector<SDValue, 16> ProxyRegOps;
- // An item of the vector is filled if the element does not need a ProxyReg
- // operation on it and should be added to InVals as is. ProxyRegOps and
- // ProxyRegTruncates contain empty/none items at the same index.
- SmallVector<SDValue, 16> RetElts;
- // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
- // to use the values of `LoadParam`s and to be replaced later then
- // `CALLSEQ_END` is added.
- SmallVector<SDValue, 16> TempProxyRegOps;
-
- // Generate loads from param memory/moves from registers for result
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
@@ -1835,104 +1689,66 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned VectorizedSize : VectorInfo) {
- EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
- EVT EltType = Ins[I].VT;
- const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
-
- if (TheLoadType != VTs[I])
- EltType = TheLoadType;
-
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- } else if (TheLoadType.getSizeInBits() < 16) {
- EltType = MVT::i16;
- }
+ for (const unsigned NumElts : VectorInfo) {
+ const MaybeAlign CurrentAlign =
+ ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
+ : commonAlignment(RetAlign, Offsets[I]);
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar load. In such cases, fall back to byte loads.
- if (VectorizedSize == 1 && RetTy->isAggregateType() &&
- EltAlign < DAG.getEVTAlign(TheLoadType)) {
- SDValue Ret = LowerUnalignedLoadRetParam(
- DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl);
- ProxyRegOps.push_back(SDValue());
- RetElts.resize(I);
- RetElts.push_back(Ret);
-
- I++;
- continue;
- }
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT LoadVT =
+ ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
- SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType);
- LoadVTs.append({MVT::Other, MVT::Glue});
+ const unsigned PackingAmt =
+ LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
- NVPTXISD::NodeType Op;
- switch (VectorizedSize) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
+ const EVT VecVT = NumElts == 1 ? LoadVT
+ : EVT::getVectorVT(*DAG.getContext(),
+ LoadVT.getScalarType(),
+ NumElts * PackingAmt);
- SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
- for (const unsigned J : llvm::seq(VectorizedSize)) {
- ProxyRegOps.push_back(RetVal.getValue(J));
- }
+ SDValue R =
+ DAG.getLoad(VecVT, dl, Call, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
- Chain = RetVal.getValue(VectorizedSize);
- InGlue = RetVal.getValue(VectorizedSize + 1);
+ LoadChains.push_back(R.getValue(1));
- I += VectorizedSize;
+ if (NumElts == 1) {
+ ProxyRegOps.push_back(R);
+ } else {
+ for (const unsigned J : llvm::seq(NumElts)) {
+ SDValue Elt = DAG.getNode(
+ LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT,
+ dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
+ ProxyRegOps.push_back(Elt);
+ }
+ }
+ I += NumElts;
}
}
- Chain =
- DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
- InGlue = Chain.getValue(1);
+ const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
+ const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
+ UniqueCallSite + 1, SDValue(), dl);
// Append ProxyReg instructions to the chain to make sure that `callseq_end`
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
// dangling.
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) {
- if (I < RetElts.size() && RetElts[I]) {
- InVals.push_back(RetElts[I]);
- continue;
- }
-
- SDValue Ret =
- DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
- {Chain, ProxyRegOps[I]});
-
- const EVT ExpectedVT = Ins[I].VT;
- if (!Ret.getValueType().bitsEq(ExpectedVT)) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret);
- }
+ for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
+ SDValue Proxy =
+ DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
+ SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
InVals.push_back(Ret);
}
- for (SDValue &T : TempProxyRegOps) {
- SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
- {Chain, T.getOperand(0)});
- DAG.ReplaceAllUsesWith(T, Repl);
- DAG.RemoveDeadNode(T.getNode());
- }
-
- // set isTailCall to false for now, until we figure out how to express
+ // set IsTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
- isTailCall = false;
- return Chain;
+ CLI.IsTailCall = false;
+ return CallEnd;
}
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
@@ -4991,10 +4807,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
Operands.push_back(DCI.DAG.getIntPtrConstant(
cast<LoadSDNode>(LD)->getExtensionType(), DL));
break;
- case NVPTXISD::LoadParamV2:
- OldNumOutputs = 2;
- Opcode = NVPTXISD::LoadParamV4;
- break;
case NVPTXISD::LoadV2:
OldNumOutputs = 2;
Opcode = NVPTXISD::LoadV4;
@@ -5066,18 +4878,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
- case NVPTXISD::StoreParam:
- Opcode = NVPTXISD::StoreParamV2;
- break;
- case NVPTXISD::StoreParamV2:
- Opcode = NVPTXISD::StoreParamV4;
- break;
case NVPTXISD::StoreV2:
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
break;
case NVPTXISD::StoreV4:
- case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreV8:
// PTX doesn't support the next doubling of operands
return SDValue();
@@ -5122,30 +4927,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT, ST->getMemOperand());
}
-static SDValue PerformStoreCombineHelper(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- unsigned Front, unsigned Back) {
- if (all_of(N->ops().drop_front(Front).drop_back(Back),
- [](const SDUse &U) { return U.get()->isUndef(); }))
- // Operand 0 is the previous value in the chain. Cannot return EntryToken
- // as the previous value will become unused and eliminated later.
- return N->getOperand(0);
-
- return combinePackingMovIntoStore(N, DCI, Front, Back);
-}
-
static SDValue PerformStoreCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
-static SDValue PerformStoreParamCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Operands from the 3rd to the 2nd last one are the values to be stored.
- // {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, DCI, 3, 1);
-}
-
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5754,6 +5540,67 @@ static SDValue combineADDRSPACECAST(SDNode *N,
return SDValue();
}
+static SDValue sinkProxyReg(SDValue R, SDValue Chain,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ switch (R.getOpcode()) {
+ case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::BITCAST: {
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
+ return SDValue();
+ }
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::OR: {
+ if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
+ return SDValue();
+ }
+ case ISD::Constant:
+ return R;
+ case ISD::LOAD:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4: {
+ return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
+ {Chain, R});
+ }
+ case ISD::BUILD_VECTOR: {
+ if (DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops;
+ for (auto &Op : R->ops()) {
+ SDValue V = sinkProxyReg(Op, Chain, DCI);
+ if (!V)
+ return SDValue();
+ Ops.push_back(V);
+ }
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
+ }
+ default:
+ return SDValue();
+ }
+}
+
+static SDValue combineProxyReg(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ if (Reg.getOpcode() != ISD::LOAD) {
+ if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
+ return V;
+ }
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5775,14 +5622,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
return combineUnpackingMovIntoLoad(N, DCI);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -5795,6 +5637,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::ADDRSPACECAST:
return combineADDRSPACECAST(N, DCI);
+ case NVPTXISD::ProxyReg:
+ return combineProxyReg(N, DCI);
}
return SDValue();
}
@@ -6137,6 +5981,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
Results.push_back(NewValue.getValue(3));
}
+static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
+
+ SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
+ SDValue NewProxy =
+ DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
+ SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
+
+ Results.push_back(Res);
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6154,6 +6014,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::CopyFromReg:
ReplaceCopyFromReg_128(N, DAG, Results);
return;
+ case NVPTXISD::ProxyReg:
+ replaceProxyReg(N, DAG, *this, Results);
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2477e1fb61595..669e63c5b263e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -38,7 +38,7 @@ enum NodeType : unsigned {
/// This node represents a PTX call instruction. It's operands are as follows:
///
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
CALL,
MoveParam,
@@ -85,13 +85,7 @@ enum NodeType : unsigned {
StoreV2,
StoreV4,
StoreV8,
- LoadParam,
- LoadParamV2,
- LoadParamV4,
- StoreParam,
- StoreParamV2,
- StoreParamV4,
- LAST_MEMORY_OPCODE = StoreParamV4,
+ LAST_MEMORY_OPCODE = StoreV8,
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 441ddeeb7d667..9f55bb04772a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1994,12 +1994,6 @@ def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -2011,104 +2005,20 @@ def declare_array_param :
def declare_scalar_param :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def proxy_reg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
def SDTCallProfile : SDTypeProfile<0, 6,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>,
SDTCisVT<3, i32>, SDTCisVT<5, i32>]>;
-def call :
- SDNode<"NVPTXISD::CALL", SDTCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-let mayLoad = true in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
- !strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins Offseti32imm:$b),
- !strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
- []>;
-}
-
-let mayStore = true in {
-
- multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
- foreach op = [IMMType, regclass] in
- if !or(support_imm, !isa<NVPTXRegClass>(op)) then
- def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
- : NVPTXInst<(outs),
- (ins op:$val, i32imm:$a, Offseti32imm:$b),
- "st.param" # opstr # " \t[param$a$b], $val;",
- []>;
- }
-
- multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
- []>;
- }
-
- multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- foreach op3 = [IMMType, regclass] in
- foreach op4 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- # !if(!isa<NVPTXRegClass>(op3), "r", "i")
- # !if(!isa<NVPTXRegClass>(op4), "r", "i")
-
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v4" # opstr #
- " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
- []>;
- }
-}
+def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
-/// NumParams, Callee, Proto, InGlue)
+/// NumParams, Callee, Proto)
def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; }
@@ -2145,43 +2055,6 @@ foreach is_convergent = [0, 1] in {
(call_uni_inst $addr, imm:$rets, imm:$params)>;
}
-def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">;
-
-defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">;
-defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">;
-defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">;
-defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">;
-
-defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>;
-defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>;
-
-defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">;
-defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">;
-defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">;
-defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">;
-
-defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">;
-defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">;
-defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">;
-
-defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">;
-defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">;
-
-defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">;
-defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">;
-
-defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">;
-
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
".param .align $align .b8 \t$a[$size];", []>;
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index ba5813c869236..b4641d01eb927 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -208,13 +208,13 @@ define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
-; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
+; CHECK-NEXT: st.param.b32 [param1], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
index ad9e4b089e8d8..b4934e1a94d1b 100644
--- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll
+++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
@@ -13,12 +13,12 @@ define void @foo() {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.global.b64 %rd1, [G];
-; CHECK-NEXT: ld.global.b64 %rd2, [G+8];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: ld.global.b64 %rd1, [G+8];
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: ld.global.b64 %rd2, [G];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: call.uni bar, (param0);
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 0cd7058174d67..0eb7f6462f6fa 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -44,11 +44,11 @@ entry:
%arrayidx7 = getelementptr inbounds [16 x i8], ptr %buf, i64 0, i64 3
store float %3, ptr %arrayidx7, align 4
-; CHECK: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd[[A_REG]]
-; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd[[SP_REG]]
-; CHECK-NEXT: call.uni callee,
+; CHECK-DAG: .param .b64 param0;
+; CHECK-DAG: .param .b64 param1;
+; CHECK-DAG: st.param.b64 [param0], %rd[[A_REG]]
+; CHECK-DAG: st.param.b64 [param1], %rd[[SP_REG]]
+; CHECK: call.uni callee,
call void @callee(ptr %a, ptr %buf) #2
ret void
diff --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
index f67145d78897b..483d48a1012c6 100644
--- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
@@ -14,11 +14,11 @@ target triple = "nvptx64-nvidia-cuda"
%complex_half = type { half, half }
; CHECK: .param .align 2 .b8 param2[4];
-; CHECK: st.param.b16 [param2], %rs1;
-; CHECK: st.param.b16 [param2+2], %rs2;
; CHECK: .param .align 2 .b8 retval0[4];
-; CHECK-NEXT: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]);
-; CHECK-NEXT: call (retval0),
+; CHECK-DAG: st.param.b16 [param2], %rs{{[0-9]+}};
+; CHECK-DAG: st.param.b16 [param2+2], %rs{{[0-9]+}};
+; CHECK: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]);
+; CHECK: call (retval0),
define weak_odr void @foo() {
entry:
%call.i.i.i = tail call %"class.complex" @_Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE(i32 0, i32 0, ptr byval(%"class.complex") null)
@@ -36,10 +36,10 @@ define internal void @callee(ptr byval(%"class.complex") %byval_arg) {
}
define void @boom() {
%fp = call ptr @usefp(ptr @callee)
- ; CHECK: .param .align 2 .b8 param0[4];
- ; CHECK: st.param.b16 [param0], %rs1;
- ; CHECK: st.param.b16 [param0+2], %rs2;
- ; CHECK: .callprototype ()_ (.param .align 2 .b8 _[4]);
+ ; CHECK-DAG: .param .align 2 .b8 param0[4];
+ ; CHECK-DAG: st.param.b16 [param0], %rs{{[0-9]+}};
+ ; CHECK-DAG: st.param.b16 [param0+2], %rs{{[0-9]+}};
+ ; CHECK-DAG: .callprototype ()_ (.param .align 2 .b8 _[4]);
call void %fp(ptr byval(%"class.complex") null)
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll
index 2232810d02128..da303b7c38eb7 100644
--- a/llvm/test/CodeGen/NVPTX/combine-mad.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll
@@ -199,10 +199,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) {
; CHECK-NEXT: add.s32 %r5, %r3, %r4;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: .param .b32 param1;
-; CHECK-NEXT: st.param.b32 [param1], %r5;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
+; CHECK-NEXT: st.param.b32 [param1], %r5;
; CHECK-NEXT: call.uni (retval0), use, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll
index ee86fe97ef781..51cbce5a22ba0 100644
--- a/llvm/test/CodeGen/NVPTX/compare-int.ll
+++ b/llvm/test/CodeGen/NVPTX/compare-int.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
@@ -11,90 +12,180 @@
;;; i64
define i64 @icmp_eq_i64(i64 %a, i64 %b) {
-; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_eq_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_eq_i64_param_1];
+; CHECK-NEXT: setp.eq.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp eq i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ne_i64(i64 %a, i64 %b) {
-; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ne_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ne_i64_param_1];
+; CHECK-NEXT: setp.ne.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ne i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ugt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ugt_i64_param_1];
+; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_uge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_uge_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_uge_i64_param_1];
+; CHECK-NEXT: setp.ge.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp uge i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ult_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ult_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ult_i64_param_1];
+; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ult i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ule_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ule_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ule_i64_param_1];
+; CHECK-NEXT: setp.le.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ule i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sgt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sgt_i64_param_1];
+; CHECK-NEXT: setp.gt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sge_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sge_i64_param_1];
+; CHECK-NEXT: setp.ge.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sge i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_slt_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_slt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_slt_i64_param_1];
+; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp slt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sle_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sle_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sle_i64_param_1];
+; CHECK-NEXT: setp.le.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sle i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
@@ -103,90 +194,180 @@ define i64 @icmp_sle_i64(i64 %a, i64 %b) {
;;; i32
define i32 @icmp_eq_i32(i32 %a, i32 %b) {
-; CHECK: setp.eq.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_eq_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_eq_i32_param_1];
+; CHECK-NEXT: setp.eq.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp eq i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ne_i32(i32 %a, i32 %b) {
-; CHECK: setp.ne.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ne_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ne_i32_param_1];
+; CHECK-NEXT: setp.ne.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ne i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ugt_i32(i32 %a, i32 %b) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ugt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ugt_i32_param_1];
+; CHECK-NEXT: setp.gt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_uge_i32(i32 %a, i32 %b) {
-; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_uge_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_uge_i32_param_1];
+; CHECK-NEXT: setp.ge.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp uge i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ult_i32(i32 %a, i32 %b) {
-; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ult_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ult_i32_param_1];
+; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ult i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ule_i32(i32 %a, i32 %b) {
-; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ule_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ule_i32_param_1];
+; CHECK-NEXT: setp.le.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ule i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sgt_i32(i32 %a, i32 %b) {
-; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sgt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sgt_i32_param_1];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sge_i32(i32 %a, i32 %b) {
-; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sge_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sge_i32_param_1];
+; CHECK-NEXT: setp.ge.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sge i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_slt_i32(i32 %a, i32 %b) {
-; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_slt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_slt_i32_param_1];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp slt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sle_i32(i32 %a, i32 %b) {
-; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sle_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sle_i32_param_1];
+; CHECK-NEXT: setp.le.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sle i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
@@ -196,90 +377,190 @@ define i32 @icmp_sle_i32(i32 %a, i32 %b) {
;;; i16
define i16 @icmp_eq_i16(i16 %a, i16 %b) {
-; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_eq_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_eq_i16_param_1];
+; CHECK-NEXT: setp.eq.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp eq i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ne_i16(i16 %a, i16 %b) {
-; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ne_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ne_i16_param_1];
+; CHECK-NEXT: setp.ne.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ne i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ugt_i16(i16 %a, i16 %b) {
-; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ugt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ugt_i16_param_1];
+; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_uge_i16(i16 %a, i16 %b) {
-; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_uge_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_uge_i16_param_1];
+; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp uge i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ult_i16(i16 %a, i16 %b) {
-; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ult_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ult_i16_param_1];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ult i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ule_i16(i16 %a, i16 %b) {
-; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ule_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ule_i16_param_1];
+; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ule i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sgt_i16(i16 %a, i16 %b) {
-; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sgt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sgt_i16_param_1];
+; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sge_i16(i16 %a, i16 %b) {
-; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sge_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sge_i16_param_1];
+; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sge i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_slt_i16(i16 %a, i16 %b) {
-; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_slt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_slt_i16_param_1];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp slt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sle_i16(i16 %a, i16 %b) {
-; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sle_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sle_i16_param_1];
+; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sle i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
@@ -290,9 +571,19 @@ define i16 @icmp_sle_i16(i16 %a, i16 %b) {
define i8 @icmp_eq_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_eq_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_eq_i8_param_1];
+; CHECK-NEXT: setp.eq.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp eq i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -300,9 +591,19 @@ define i8 @icmp_eq_i8(i8 %a, i8 %b) {
define i8 @icmp_ne_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ne_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ne_i8_param_1];
+; CHECK-NEXT: setp.ne.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ne i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -310,9 +611,19 @@ define i8 @icmp_ne_i8(i8 %a, i8 %b) {
define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ugt_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ugt_i8_param_1];
+; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -320,9 +631,19 @@ define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
define i8 @icmp_uge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_uge_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_uge_i8_param_1];
+; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp uge i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -330,9 +651,19 @@ define i8 @icmp_uge_i8(i8 %a, i8 %b) {
define i8 @icmp_ult_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ult_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ult_i8_param_1];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ult i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -340,9 +671,19 @@ define i8 @icmp_ult_i8(i8 %a, i8 %b) {
define i8 @icmp_ule_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ule_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ule_i8_param_1];
+; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ule i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -350,9 +691,19 @@ define i8 @icmp_ule_i8(i8 %a, i8 %b) {
define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sgt_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sgt_i8_param_1];
+; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -360,9 +711,19 @@ define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
define i8 @icmp_sge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sge_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sge_i8_param_1];
+; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sge i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -370,9 +731,19 @@ define i8 @icmp_sge_i8(i8 %a, i8 %b) {
define i8 @icmp_slt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_slt_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_slt_i8_param_1];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp slt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -380,9 +751,19 @@ define i8 @icmp_slt_i8(i8 %a, i8 %b) {
define i8 @icmp_sle_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sle_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sle_i8_param_1];
+; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sle i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
index d1b478d341915..7d898b8fe771a 100644
--- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
@@ -7,52 +8,203 @@ declare i64 @callee_variadic(ptr %p, ...);
define %struct.64 @test_return_type_mismatch(ptr %p) {
; CHECK-LABEL: test_return_type_mismatch(
-; CHECK: .param .align 1 .b8 retval0[8];
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<40>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_param_0];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 1 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_0 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_0;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_0;
+; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5];
+; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: shl.b64 %rd13, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10;
+; CHECK-NEXT: shl.b64 %rd16, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14;
+; CHECK-NEXT: shl.b64 %rd23, %rd5, 8;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6;
+; CHECK-NEXT: shl.b64 %rd26, %rd4, 16;
+; CHECK-NEXT: shl.b64 %rd28, %rd3, 24;
+; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26;
+; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24;
+; CHECK-NEXT: shl.b64 %rd31, %rd30, 32;
+; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rd10;
+; CHECK-NEXT: shr.u64 %rd33, %rd32, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd32, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd32, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35;
+; CHECK-NEXT: shr.u64 %rd36, %rd32, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36;
+; CHECK-NEXT: shr.u64 %rd37, %rd32, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37;
+; CHECK-NEXT: shr.u64 %rd38, %rd32, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38;
+; CHECK-NEXT: shr.u64 %rd39, %rd32, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT: ret;
%ret = call %struct.64 @callee(ptr %p)
ret %struct.64 %ret
}
define i64 @test_param_type_mismatch(ptr %p) {
; CHECK-LABEL: test_param_type_mismatch(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 retval0;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b64 _) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_1;
+; CHECK-NEXT: st.param.b64 [param0], 7;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_1;
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 @callee(i64 7)
ret i64 %ret
}
define i64 @test_param_count_mismatch(ptr %p) {
; CHECK-LABEL: test_param_count_mismatch(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_param_count_mismatch_param_0];
+; CHECK-NEXT: { // callseq 2, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0, param1), prototype_2;
+; CHECK-NEXT: st.param.b64 [param1], 7;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_2;
+; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-NEXT: } // callseq 2
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = call i64 @callee(ptr %p, i64 7)
ret i64 %ret
}
define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_return_type_mismatch_variadic(
-; CHECK: .param .align 1 .b8 retval0[8];
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<40>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 3, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 1 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_3 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_3;
+; CHECK-NEXT: mov.b64 %rd1, callee_variadic;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_3;
+; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5];
+; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0];
+; CHECK-NEXT: } // callseq 3
+; CHECK-NEXT: shl.b64 %rd13, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10;
+; CHECK-NEXT: shl.b64 %rd16, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14;
+; CHECK-NEXT: shl.b64 %rd23, %rd5, 8;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6;
+; CHECK-NEXT: shl.b64 %rd26, %rd4, 16;
+; CHECK-NEXT: shl.b64 %rd28, %rd3, 24;
+; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26;
+; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24;
+; CHECK-NEXT: shl.b64 %rd31, %rd30, 32;
+; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rd10;
+; CHECK-NEXT: shr.u64 %rd33, %rd32, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd32, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd32, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35;
+; CHECK-NEXT: shr.u64 %rd36, %rd32, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36;
+; CHECK-NEXT: shr.u64 %rd37, %rd32, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37;
+; CHECK-NEXT: shr.u64 %rd38, %rd32, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38;
+; CHECK-NEXT: shr.u64 %rd39, %rd32, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT: ret;
%ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p)
ret %struct.64 %ret
}
define i64 @test_param_type_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_param_type_mismatch_variadic(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 4, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param1], 7;
; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1);
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 4
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
ret i64 %ret
}
define i64 @test_param_count_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_param_count_mismatch_variadic(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 5, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param1], 7;
; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1);
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 5
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
ret i64 %ret
}
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 4d2ba7d00f872..06fb8d2c7c54d 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -22,8 +22,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
; CHECK-32-NEXT: cvta.local.u32 %r5, %r4;
; CHECK-32-NEXT: { // callseq 0, 0
; CHECK-32-NEXT: .param .b32 param0;
-; CHECK-32-NEXT: st.param.b32 [param0], %r5;
; CHECK-32-NEXT: .param .b32 retval0;
+; CHECK-32-NEXT: st.param.b32 [param0], %r5;
; CHECK-32-NEXT: call.uni (retval0), bar, (param0);
; CHECK-32-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-32-NEXT: } // callseq 0
@@ -43,8 +43,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4;
; CHECK-64-NEXT: { // callseq 0, 0
; CHECK-64-NEXT: .param .b64 param0;
-; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
; CHECK-64-NEXT: .param .b32 retval0;
+; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
; CHECK-64-NEXT: call.uni (retval0), bar, (param0);
; CHECK-64-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-64-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 43a605f2b34d7..b5637f29715e4 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -462,10 +462,10 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -485,10 +485,10 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 1
@@ -508,10 +508,10 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 5aa12b08a3812..87274aa759bea 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -36,10 +36,10 @@ define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
; CHECK-NEXT: fma.rn.f32 %r6, %r1, %r2, %r5;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: .param .b32 param1;
-; CHECK-NEXT: st.param.b32 [param1], %r6;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b32 [param1], %r6;
+; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: call.uni (retval0), dummy_f32, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r7, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -83,10 +83,10 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
; CHECK-NEXT: fma.rn.f64 %rd6, %rd1, %rd2, %rd5;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd6;
; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param1], %rd6;
+; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: call.uni (retval0), dummy_f64, (param0, param1);
; CHECK-NEXT: ld.param.b64 %rd7, [retval0];
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index ed8f6b4511079..636e12bf98943 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -64,9 +64,9 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
+; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0];
; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni byval_user, (param0);
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll
index 4f4c2fe73ba7f..79abca0a1dd8e 100644
--- a/llvm/test/CodeGen/NVPTX/i128-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -29,11 +29,11 @@ start:
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
- ; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK-NEXT: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
- ; CHECK: .param .align 16 .b8 param1[16];
- ; CHECK-NEXT: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
- ; CHECK: } // callseq [[CALLSEQ_ID]]
+ ; CHECK-DAG: .param .align 16 .b8 param0[16];
+ ; CHECK-DAG: .param .align 16 .b8 param1[16];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
+ ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
+ ; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, ptr %2)
ret void
@@ -48,11 +48,11 @@ start:
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
- ; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
- ; CHECK: .param .align 16 .b8 param1[16];
- ; CHECK: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
- ; CHECK: } // callseq [[CALLSEQ_ID]]
+ ; CHECK-DAG: .param .align 16 .b8 param0[16];
+ ; CHECK-DAG: .param .align 16 .b8 param1[16];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
+ ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
+ ; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, ptr %2)
ret void
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index e89ab7a5605c3..c8441d52e41e1 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -642,10 +642,10 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_call_param_0];
; COMMON-NEXT: { // callseq 0, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r1;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r2;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r2;
+; COMMON-NEXT: st.param.b32 [param0], %r1;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 0
@@ -665,10 +665,10 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; COMMON-NEXT: { // callseq 1, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r1;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r1;
+; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 1
@@ -688,10 +688,10 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; COMMON-NEXT: { // callseq 2, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r1;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r1;
+; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
index 3edd4e4da60e0..98f94bb7b3ac1 100644
--- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -1,42 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
-; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | FileCheck %s
-; RUN: %if ptxas %{ \
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_90 \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs -O0 | FileCheck %s --check-prefixes=O0,COMMON
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=O3,COMMON
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs -O0 \
+; RUN: | %ptxas-verify -arch=sm_90 \
+; RUN: %}
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_90 \
; RUN: %}
+target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
-; CHECK-LABEL: test_bitcast_2xi8_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
-; CHECK-NEXT: or.b16 %rs4, %rs1, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r2, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
+; O0-LABEL: test_bitcast_2xi8_i16(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<5>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0];
+; O0-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; O0-NEXT: shl.b16 %rs3, %rs2, 8;
+; O0-NEXT: or.b16 %rs4, %rs1, %rs3;
+; O0-NEXT: cvt.u32.u16 %r2, %rs4;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_bitcast_2xi8_i16(
+; O3: {
+; O3-NEXT: .reg .b32 %r<2>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.b16 %r1, [test_bitcast_2xi8_i16_param_0];
+; O3-NEXT: st.param.b32 [func_retval0], %r1;
+; O3-NEXT: ret;
%res = bitcast <2 x i8> %a to i16
ret i16 %res
}
define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
-; CHECK-LABEL: test_bitcast_i16_2xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
-; CHECK-NEXT: ret;
+; O0-LABEL: test_bitcast_i16_2xi8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; O0-NEXT: st.param.b16 [func_retval0], %rs1;
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_bitcast_i16_2xi8(
+; O3: {
+; O3-NEXT: .reg .b16 %rs<2>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; O3-NEXT: st.param.b16 [func_retval0], %rs1;
+; O3-NEXT: ret;
%res = bitcast i16 %a to <2 x i8>
ret <2 x i8> %res
}
+
+define <2 x i8> @test_call_2xi8(<2 x i8> %a) {
+; O0-LABEL: test_call_2xi8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<7>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0];
+; O0-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; O0-NEXT: { // callseq 0, 0
+; O0-NEXT: .param .align 2 .b8 param0[2];
+; O0-NEXT: .param .align 2 .b8 retval0[2];
+; O0-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2};
+; O0-NEXT: call.uni (retval0), test_call_2xi8, (param0);
+; O0-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0];
+; O0-NEXT: } // callseq 0
+; O0-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4};
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_call_2xi8(
+; O3: {
+; O3-NEXT: .reg .b16 %rs<7>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0];
+; O3-NEXT: { // callseq 0, 0
+; O3-NEXT: .param .align 2 .b8 param0[2];
+; O3-NEXT: .param .align 2 .b8 retval0[2];
+; O3-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2};
+; O3-NEXT: call.uni (retval0), test_call_2xi8, (param0);
+; O3-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0];
+; O3-NEXT: } // callseq 0
+; O3-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4};
+; O3-NEXT: ret;
+ %res = call <2 x i8> @test_call_2xi8(<2 x i8> %a)
+ ret <2 x i8> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index fd2e56bb126bb..0a2dbbd11fe02 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -828,10 +828,10 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -851,10 +851,10 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 1
@@ -874,10 +874,10 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll
index be84f9bfb1aeb..a3bf8922a98f4 100644
--- a/llvm/test/CodeGen/NVPTX/idioms.ll
+++ b/llvm/test/CodeGen/NVPTX/idioms.ll
@@ -173,8 +173,8 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: shr.s32 %r2, %r1, 16;
; CHECK-NEXT: shr.u32 %r3, %r2, 16;
-; CHECK-NEXT: st.param.b16 [func_retval0], %r2;
; CHECK-NEXT: st.param.b16 [func_retval0+2], %r3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %r2;
; CHECK-NEXT: ret;
call void @escape_int(i32 %i); // Force %i to be loaded completely.
%i1 = ashr i32 %i, 16
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index eae0321433946..782e6720e5112 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -23,15 +23,15 @@ define internal i32 @foo() {
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
-; CHECK-NEXT: add.u64 %rd3, %SPL, 1;
-; CHECK-NEXT: ld.local.b8 %rs1, [%rd3];
-; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[1];
-; CHECK-NEXT: st.param.b8 [param0], %rs1;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd4;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: add.u64 %rd4, %SPL, 1;
+; CHECK-NEXT: ld.local.b8 %rs1, [%rd4];
+; CHECK-NEXT: st.param.b8 [param0], %rs1;
; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0;
; CHECK-NEXT: ld.param.b32 %r1, [retval0];
@@ -60,15 +60,15 @@ define internal i32 @bar() {
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
-; CHECK-NEXT: add.u64 %rd3, %SPL, 8;
-; CHECK-NEXT: ld.local.b64 %rd4, [%rd3];
-; CHECK-NEXT: add.u64 %rd5, %SP, 0;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd5;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: add.u64 %rd4, %SPL, 8;
+; CHECK-NEXT: ld.local.b64 %rd5, [%rd4];
+; CHECK-NEXT: st.param.b64 [param0], %rd5;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1;
; CHECK-NEXT: ld.param.b32 %r1, [retval0];
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 321a6240df098..38185c7bf30de 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -121,20 +121,18 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0;
; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_0;
-; PTX-NEXT: ld.param.b32 %r1, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
@@ -153,7 +151,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
-; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -167,18 +165,17 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: add.u64 %rd6, %SP, 0;
; PTX-NEXT: add.u64 %rd7, %SPL, 0;
; PTX-NEXT: st.local.b32 [%rd7], %r1;
-; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 param1;
-; PTX-NEXT: st.param.b64 [param1], %rd6;
; PTX-NEXT: .param .b64 param2;
-; PTX-NEXT: st.param.b64 [param2], %rd4;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param2], %rd4;
+; PTX-NEXT: st.param.b64 [param1], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: call (retval0), %rd1, (param0, param1, param2), prototype_1;
-; PTX-NEXT: ld.param.b32 %r2, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
@@ -255,7 +252,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -266,14 +263,13 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0];
; PTX-NEXT: add.s32 %r2, %r1, %r1;
; PTX-NEXT: st.global.b32 [%rd4], %r2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 2, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_2;
-; PTX-NEXT: ld.param.b32 %r3, [retval0];
; PTX-NEXT: } // callseq 2
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
@@ -295,7 +291,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -307,14 +303,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4];
; PTX-NEXT: st.global.b64 [%rd4], %rd5;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_3;
-; PTX-NEXT: ld.param.b32 %r4, [retval0];
; PTX-NEXT: } // callseq 3
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
@@ -535,9 +530,9 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: { // callseq 4, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: st.param.b32 [param0], %r1;
; PTX-NEXT: call.uni device_func, (param0);
; PTX-NEXT: } // callseq 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index c165de7ffff03..7c029ab516d6e 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; PTX-LABEL: load_alignment(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0;
@@ -45,10 +45,9 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; PTX-NEXT: st.b32 [%rd3], %r3;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: call.uni (retval0), escape, (param0);
-; PTX-NEXT: ld.param.b64 %rd6, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
entry:
@@ -76,17 +75,16 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
;
; PTX-LABEL: load_padding(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, load_padding_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: call.uni (retval0), escape, (param0);
-; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 4784d7093a796..20a35198c3c16 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -911,9 +911,9 @@ define void @device_func(ptr byval(i32) align 4 %input) {
; PTX-NEXT: .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0];
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0];
; PTX-NEXT: st.param.b32 [param0], %r1;
; PTX-NEXT: call.uni device_func, (param0);
; PTX-NEXT: } // callseq 3
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index 8401f457418d1..b2994c0a97585 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: wombat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2];
@@ -19,19 +19,18 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], 0d0000000000000000;
; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], 0;
; CHECK-NEXT: call.uni (retval0), quux, (param0);
-; CHECK-NEXT: ld.param.b64 %rd1, [retval0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
-; CHECK-NEXT: cvt.rn.f64.s32 %rd2, %r9;
-; CHECK-NEXT: cvt.rn.f64.u32 %rd3, %r10;
-; CHECK-NEXT: add.rn.f64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 %rd5, 0;
-; CHECK-NEXT: st.global.b64 [%rd5], %rd4;
+; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r9;
+; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r10;
+; CHECK-NEXT: add.rn.f64 %rd3, %rd2, %rd1;
+; CHECK-NEXT: mov.b64 %rd4, 0;
+; CHECK-NEXT: st.global.b64 [%rd4], %rd3;
; CHECK-NEXT: mov.b32 %r10, 1;
; CHECK-NEXT: bra.uni $L__BB0_1;
bb:
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index 4fa1235633cf6..c5ea9f850ea1f 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -18,16 +18,16 @@ define i32 @test(%struct.1float alignstack(32) %data) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
-; CHECK-NEXT: shr.u32 %r2, %r1, 8;
-; CHECK-NEXT: shr.u32 %r3, %r1, 16;
-; CHECK-NEXT: shr.u32 %r4, %r1, 24;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[4];
+; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: st.param.b8 [param0], %r1;
+; CHECK-NEXT: shr.u32 %r2, %r1, 8;
; CHECK-NEXT: st.param.b8 [param0+1], %r2;
+; CHECK-NEXT: shr.u32 %r3, %r1, 16;
; CHECK-NEXT: st.param.b8 [param0+2], %r3;
+; CHECK-NEXT: shr.u32 %r4, %r3, 8;
; CHECK-NEXT: st.param.b8 [param0+3], %r4;
-; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: call.uni (retval0), callee, (param0);
; CHECK-NEXT: ld.param.b32 %r5, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 6c52bfd6cbfd8..cfd5658db9089 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -27,10 +27,10 @@
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0];
; CHECK: and.b16 [[A:%rs[0-9]+]], [[A8]], 1;
; CHECK: setp.ne.b16 %p1, [[A]], 0
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 retval0;
; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]]
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[B]]
-; CHECK: .param .b32 retval0;
+; CHECK-DAG: st.param.b32 [param0], [[B]]
; CHECK: call.uni (retval0), test_i1,
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R8]];
@@ -47,11 +47,11 @@ define i1 @test_i1(i1 %a) {
; CHECK-NEXT: .param .b32 test_i1s_param_0
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: .param .b32 retval0;
; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
-; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0], [[A]];
-; CHECK: .param .b32 retval0;
; CHECK: call.uni
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
@@ -70,9 +70,9 @@ define signext i1 @test_i1s(i1 signext %a) {
; CHECK-DAG: ld.param.b8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v3i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: .param .align 1 .b8 retval0[1];
; CHECK-DAG: st.param.b8 [param0], [[E0]];
; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni (retval0), test_v3i1,
; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
@@ -89,8 +89,8 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) {
; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1]
; CHECK: ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[E0]];
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[E0]];
; CHECK: call.uni (retval0), test_v4i1,
; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1];
@@ -112,9 +112,9 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) {
; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v5i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: .param .align 1 .b8 retval0[1];
; CHECK-DAG: st.param.b8 [param0], [[E0]];
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni (retval0), test_v5i1,
; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
@@ -131,8 +131,8 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) {
; CHECK-NEXT: .param .b32 test_i2_param_0
; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i2,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -147,8 +147,8 @@ define i2 @test_i2(i2 %a) {
; CHECK-NEXT: .param .b32 test_i3_param_0
; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i3,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -163,10 +163,10 @@ define i3 @test_i3(i3 %a) {
; CHECK-LABEL: test_i8(
; CHECK-NEXT: .param .b32 test_i8_param_0
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[A32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: st.param.b32 [param0], [[A32]];
; CHECK: call.uni (retval0), test_i8,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R32]];
@@ -181,10 +181,10 @@ define i8 @test_i8(i8 %a) {
; CHECK-LABEL: test_i8s(
; CHECK-NEXT: .param .b32 test_i8s_param_0
; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
-; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[A]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: st.param.b32 [param0], [[A]];
; CHECK: call.uni (retval0), test_i8s,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
@@ -202,8 +202,8 @@ define signext i8 @test_i8s(i8 signext %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v3i8_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[R]]
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[R]]
; CHECK: call.uni (retval0), test_v3i8,
; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0];
; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very
@@ -220,8 +220,8 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0]
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[R]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[R]];
; CHECK: call.uni (retval0), test_v4i8,
; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[RET]];
@@ -237,20 +237,13 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v5i8_param_0]
; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0],
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: call.uni (retval0), test_v5i8,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: cvt.u32.u16 [[R3:%r[0-9]+]], [[RE3]];
-; CHECK-DAG: cvt.u32.u16 [[R2:%r[0-9]+]], [[RE2]];
-; CHECK-DAG: prmt.b32 [[P0:%r[0-9]+]], [[R2]], [[R3]], 0x3340U;
-; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RE1]];
-; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RE0]];
-; CHECK-DAG: prmt.b32 [[P1:%r[0-9]+]], [[R0]], [[R1]], 0x3340U;
-; CHECK-DAG: prmt.b32 [[P2:%r[0-9]+]], [[P1]], [[P0]], 0x5410U;
-; CHECK-DAG: st.param.b32 [func_retval0], [[P2]];
+; CHECK-DAG: st.param.b32 [func_retval0], {{%r[0-9]+}};
; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i8> @test_v5i8(<5 x i8> %a) {
@@ -262,8 +255,8 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) {
; CHECK-LABEL: test_i11(
; CHECK-NEXT: .param .b32 test_i11_param_0
; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0];
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i11,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -277,10 +270,10 @@ define i11 @test_i11(i11 %a) {
; CHECK-LABEL: test_i16(
; CHECK-NEXT: .param .b32 test_i16_param_0
; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16_param_0];
-; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: call.uni (retval0), test_i16,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[RE32]];
@@ -294,10 +287,10 @@ define i16 @test_i16(i16 %a) {
; CHECK-LABEL: test_i16s(
; CHECK-NEXT: .param .b32 test_i16s_param_0
; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
-; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: call.uni (retval0), test_i16s,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
@@ -312,11 +305,11 @@ define signext i16 @test_i16s(i16 signext %a) {
; CHECK-LABEL: test_v3i16(
; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3i16_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b16 [param0+4], [[E2]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: call.uni (retval0), test_v3i16,
; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0];
; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
@@ -333,8 +326,8 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) {
; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
; CHECK: call.uni (retval0), test_v4i16,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}
@@ -348,11 +341,11 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) {
; CHECK-LABEL: test_v5i16(
; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5i16_param_0]
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: call.uni (retval0), test_v5i16,
; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
@@ -369,8 +362,8 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) {
; CHECK-NEXT: .param .align 2 .b8 test_f16_param_0[2]
; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_f16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[E]];
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[E]];
; CHECK: call.uni (retval0), test_f16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]]
@@ -385,8 +378,8 @@ define half @test_f16(half %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2f16_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_v2f16,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]]
@@ -401,8 +394,8 @@ define <2 x half> @test_v2f16(<2 x half> %a) {
; CHECK-NEXT: .param .align 2 .b8 test_bf16_param_0[2]
; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_bf16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[E]];
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[E]];
; CHECK: call.uni (retval0), test_bf16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]]
@@ -417,8 +410,8 @@ define bfloat @test_bf16(bfloat %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v2bf16_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2bf16_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_v2bf16,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]]
@@ -432,12 +425,12 @@ define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) {
; CHECK:.func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v3f16(
; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
-; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3f16_param_0];
+; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3f16_param_0];
; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3f16_param_0+4];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: call.uni (retval0), test_v3f16,
; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4];
@@ -454,8 +447,8 @@ define <3 x half> @test_v3f16(<3 x half> %a) {
; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
; CHECK: ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]};
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]};
; CHECK: call.uni (retval0), test_v4f16,
; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]};
@@ -468,12 +461,12 @@ define <4 x half> @test_v4f16(<4 x half> %a) {
; CHECK:.func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: test_v5f16(
; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5f16_param_0];
; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5f16_param_0+8];
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0],
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: call.uni (retval0), test_v5f16,
; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8];
@@ -490,8 +483,8 @@ define <5 x half> @test_v5f16(<5 x half> %a) {
; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
; CHECK: ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]};
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]};
; CHECK: call.uni (retval0), test_v8f16,
; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
@@ -504,14 +497,14 @@ define <8 x half> @test_v8f16(<8 x half> %a) {
; CHECK:.func (.param .align 32 .b8 func_retval0[32])
; CHECK-LABEL: test_v9f16(
; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v9f16_param_0];
-; CHECK-DAG: ld.param.v4.b16 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v9f16_param_0+8];
; CHECK-DAG: ld.param.b16 [[E8:%rs[0-9]+]], [test_v9f16_param_0+16];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK-DAG: st.param.v4.b16 [param0],
-; CHECK-DAG: st.param.v4.b16 [param0+8],
-; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
; CHECK: call.uni (retval0), test_v9f16,
; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8];
@@ -531,8 +524,8 @@ define <9 x half> @test_v9f16(<9 x half> %a) {
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0];
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i19,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -548,8 +541,8 @@ define i19 @test_i19(i19 %a) {
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0];
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i23,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -565,8 +558,8 @@ define i23 @test_i23(i23 %a) {
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2];
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i24,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -581,8 +574,8 @@ define i24 @test_i24(i24 %a) {
; CHECK-NEXT: .param .b32 test_i29_param_0
; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i29_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i29,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -597,8 +590,8 @@ define i29 @test_i29(i29 %a) {
; CHECK-NEXT: .param .b32 test_i32_param_0
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_i32_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -613,10 +606,10 @@ define i32 @test_i32(i32 %a) {
; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: .param .align 16 .b8 param0[16];
+; CHECK-DAG: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK: call.uni (retval0), test_v3i32,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
@@ -632,9 +625,9 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) {
; CHECK-LABEL: test_v4i32(
; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
; CHECK: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: .param .align 16 .b8 param0[16];
+; CHECK-DAG: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: call.uni (retval0), test_v4i32,
; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
@@ -650,9 +643,9 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) {
; CHECK-DAG: ld.param.b32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: .param .align 32 .b8 retval0[32];
; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
-; CHECK: .param .align 32 .b8 retval0[32];
; CHECK: call.uni (retval0), test_v5i32,
; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
@@ -669,8 +662,8 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) {
; CHECK-NEXT: .param .b32 test_f32_param_0
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_f32_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_f32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -686,8 +679,8 @@ define float @test_f32(float %a) {
; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i40,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -703,8 +696,8 @@ define i40 @test_i40(i40 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i47,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -720,8 +713,8 @@ define i47 @test_i47(i47 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i48,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -738,8 +731,8 @@ define i48 @test_i48(i48 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i51,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -756,8 +749,8 @@ define i51 @test_i51(i51 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i56,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -772,8 +765,8 @@ define i56 @test_i56(i56 %a) {
; CHECK-NEXT: .param .b64 test_i57_param_0
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i57_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i57,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -788,8 +781,8 @@ define i57 @test_i57(i57 %a) {
; CHECK-NEXT: .param .b64 test_i64_param_0
; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_i64_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], [[E]];
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], [[E]];
; CHECK: call.uni (retval0), test_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0];
; CHECK: st.param.b64 [func_retval0], [[R]];
@@ -805,9 +798,9 @@ define i64 @test_i64(i64 %a) {
; CHECK-DAG: ld.param.b64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b64 [param0+16], [[E2]];
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b64 [param0+16], [[E2]];
; CHECK: call.uni (retval0), test_v3i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0];
; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
@@ -828,9 +821,9 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) {
; CHECK-DAG: ld.param.v2.b64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
; CHECK: call.uni (retval0), test_v4i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0];
; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
@@ -849,8 +842,8 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) {
; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1]
; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i1,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b8 [func_retval0], [[R]];
@@ -865,8 +858,8 @@ define %s_i1 @test_s_i1(%s_i1 %a) {
; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i8,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b8 [func_retval0], [[R]];
@@ -881,8 +874,8 @@ define %s_i8 @test_s_i8(%s_i8 %a) {
; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]];
@@ -897,8 +890,8 @@ define %s_i16 @test_s_i16(%s_i16 %a) {
; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_f16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]];
@@ -913,8 +906,8 @@ define %s_f16 @test_s_f16(%s_f16 %a) {
; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_i32_param_0];
; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -929,8 +922,8 @@ define %s_i32 @test_s_i32(%s_i32 %a) {
; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_f32_param_0];
; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_f32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -945,8 +938,8 @@ define %s_f32 @test_s_f32(%s_f32 %a) {
; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.b64 [param0], [[E]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.b64 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0];
; CHECK: st.param.b64 [func_retval0], [[R]];
@@ -966,12 +959,12 @@ define %s_i64 @test_s_i64(%s_i64 %a) {
; CHECK-DAG: ld.param.b32 [[E1:%r[0-9]+]], [test_s_i32f32_param_0+4];
; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: .param .align 8 .b8 retval0[24];
; CHECK-DAG: st.param.b32 [param0], [[E0]];
; CHECK-DAG: st.param.b32 [param0+4], [[E1]];
; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK-DAG: st.param.b32 [param0+12], [[E3]];
; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[24];
; CHECK: call.uni (retval0), test_s_i32f32,
; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b32 [[RE1:%r[0-9]+]], [retval0+4];
@@ -997,10 +990,10 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[24];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
-; CHECK: st.param.b64 [param0+16], [[E4]];
; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
; CHECK: call.uni (retval0), test_s_i32x4,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
@@ -1024,16 +1017,13 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
; CHECK: ld.param.b8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[32];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+8], [[E2]];
-; CHECK: st.param.b32 [param0+12], [[E3]];
-; CHECK: st.param.b32 [param0+16], [[E4]];
-; CHECK: st.param.b64 [param0+24], [[E5]];
; CHECK: .param .align 8 .b8 retval0[32];
-; CHECK: call.uni (retval0), test_s_i1i32x4,
-; CHECK: (
-; CHECK: param0
-; CHECK: );
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b8 [param0+8], [[E2]];
+; CHECK-DAG: st.param.b32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK-DAG: st.param.b64 [param0+24], [[E5]];
+; CHECK: call.uni (retval0), test_s_i1i32x4, (param0);
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
@@ -1082,6 +1072,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0];
; CHECK: .param .align 1 .b8 param0[25];
+; CHECK: .param .align 1 .b8 retval0[25];
; CHECK-DAG: st.param.b8 [param0],
; CHECK-DAG: st.param.b8 [param0+1],
; CHECK-DAG: st.param.b8 [param0+2],
@@ -1107,33 +1098,32 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: st.param.b8 [param0+22],
; CHECK-DAG: st.param.b8 [param0+23],
; CHECK-DAG: st.param.b8 [param0+24],
-; CHECK: .param .align 1 .b8 retval0[25];
-; CHECK: call.uni (retval0), test_s_i1i32x4p,
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK: call.uni (retval0), test_s_i1i32x4p, (param0);
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+3];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+2];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+1];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+7];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+6];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+5];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+4];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+12];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+11];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+10];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+9];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+16];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+15];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+14];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+13];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+24];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+23];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+22];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+21];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+20];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+19];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+18];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+17];
; CHECK: } // callseq
; CHECK-DAG: st.param.b8 [func_retval0],
; CHECK-DAG: st.param.b8 [func_retval0+1],
@@ -1177,13 +1167,13 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
; CHECK: ld.param.b32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
; CHECK: .param .align 16 .b8 param0[80];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
-; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
-; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
-; CHECK: st.param.b32 [param0+64], [[E15]];
; CHECK: .param .align 16 .b8 retval0[80];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
+; CHECK-DAG: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
+; CHECK-DAG: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
+; CHECK-DAG: st.param.b32 [param0+64], [[E15]];
; CHECK: call.uni (retval0), test_s_crossfield,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll
index 88ad0b0a9f9d1..2155fb4031c36 100644
--- a/llvm/test/CodeGen/NVPTX/param-overalign.ll
+++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll
@@ -28,8 +28,8 @@ define float @caller_md(float %a, float %b) {
; CHECK-NEXT: ld.param.b32 %r2, [caller_md_param_1];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), callee_md, (param0);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -69,8 +69,8 @@ define float @caller(float %a, float %b) {
; CHECK-NEXT: ld.param.b32 %r2, [caller_param_1];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), callee, (param0);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index a480984a538b3..a592b82614f43 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -84,8 +84,8 @@ define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x1_param_1
; CHECK: )
; CHECK: .param .b32 param0;
- ; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[4];
+ ; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), callee_St4x1, (param0);
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
%1 = load i32, ptr %in, align 4
@@ -112,8 +112,8 @@ define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[8];
- ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[8];
+ ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x2, (param0);
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%agg.tmp = alloca %struct.St4x2, align 8
@@ -149,9 +149,9 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[12];
+ ; CHECK: .param .align 16 .b8 retval0[12];
; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[12];
; CHECK: call.uni (retval0), callee_St4x3, (param0);
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8];
@@ -193,8 +193,8 @@ define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
+ ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x4, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2
@@ -239,9 +239,9 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x5_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[20];
+ ; CHECK: .param .align 16 .b8 retval0[20];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[20];
; CHECK: call.uni (retval0), callee_St4x5, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16];
@@ -295,9 +295,9 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x6_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
+ ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
- ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0), callee_St4x6, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -357,10 +357,10 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x7_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[28];
+ ; CHECK: .param .align 16 .b8 retval0[28];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[28];
; CHECK: call.uni (retval0), callee_St4x7, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -429,9 +429,9 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x8_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
- ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
- ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
+ ; CHECK-DAG: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+ ; CHECK-DAG: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x8, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -503,8 +503,8 @@ define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x1_param_1
; CHECK: )
; CHECK: .param .b64 param0;
- ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[8];
+ ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), callee_St8x1, (param0);
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
%1 = load i64, ptr %in, align 8
@@ -531,8 +531,8 @@ define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
+ ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: call.uni (retval0), callee_St8x2, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
%call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2
@@ -565,9 +565,9 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
+ ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0), callee_St8x3, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16];
@@ -609,9 +609,9 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
- ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
- ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+ ; CHECK-DAG: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: call.uni (retval0), callee_St8x4, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
index 5d0d6f6ecd5ff..4a5315203dba1 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
@@ -77,7 +77,7 @@ constants: []
machineFunctionInfo: {}
body: |
bb.0:
- %0:b32, %1:b32, %2:b32, %3:b32 = LoadParamMemV4I32 0
+ %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, &retval0, 0 :: (load (s128), addrspace 101)
; CHECK-NOT: ProxyReg
%4:b32 = ProxyRegB32 killed %0
%5:b32 = ProxyRegB32 killed %1
@@ -86,7 +86,7 @@ body: |
; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3
STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101)
- %8:b32 = LoadParamMemI32 0
+ %8:b32 = LD_i32 0, 0, 101, 3, 32, &retval0, 0 :: (load (s32), addrspace 101)
; CHECK-NOT: ProxyReg
%9:b32 = ProxyRegB32 killed %8
%10:b32 = ProxyRegB32 killed %9
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 6aa111932a4a5..55f482f1aeb61 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -26,8 +26,8 @@ define void @st_param_i8_i16() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 2 .b8 param0[4];
-; CHECK-NEXT: st.param.b8 [param0], 1;
; CHECK-NEXT: st.param.b16 [param0+2], 2;
+; CHECK-NEXT: st.param.b8 [param0], 1;
; CHECK-NEXT: call.uni call_i8_i16, (param0);
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
@@ -75,7 +75,7 @@ define void @st_param_f32() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 3, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], 0f40A00000;
+; CHECK-NEXT: st.param.b32 [param0], 1084227584;
; CHECK-NEXT: call.uni call_f32, (param0);
; CHECK-NEXT: } // callseq 3
; CHECK-NEXT: ret;
@@ -91,7 +91,7 @@ define void @st_param_f64() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 4, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], 0d4018000000000000;
+; CHECK-NEXT: st.param.b64 [param0], 4618441417868443648;
; CHECK-NEXT: call.uni call_f64, (param0);
; CHECK-NEXT: } // callseq 4
; CHECK-NEXT: ret;
@@ -165,7 +165,7 @@ define void @st_param_v2_i16_ii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 8, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v2.b16 [param0], {1, 2};
+; CHECK-NEXT: st.param.b32 [param0], 131073;
; CHECK-NEXT: call.uni call_v2_i16, (param0);
; CHECK-NEXT: } // callseq 8
; CHECK-NEXT: ret;
@@ -432,7 +432,7 @@ define void @st_param_v4_i8_iiii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 23, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], 67305985;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 23
; CHECK-NEXT: ret;
@@ -442,15 +442,19 @@ define void @st_param_v4_i8_iiii() {
define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_irrr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irrr_param_2];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_irrr_param_1];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT: mov.b32 %r5, 1;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 24, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r7;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 24
; CHECK-NEXT: ret;
@@ -464,15 +468,18 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_rirr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rirr_param_2];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rirr_param_1];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 25, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 25
; CHECK-NEXT: ret;
@@ -486,15 +493,19 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_rrir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrir_param_2];
+; CHECK-NEXT: mov.b32 %r2, 3;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrir_param_1];
+; CHECK-NEXT: ld.param.b8 %r5, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 26, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r7;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 26
; CHECK-NEXT: ret;
@@ -508,15 +519,18 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_rrri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrri_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrri_param_2];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 27, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 27
; CHECK-NEXT: ret;
@@ -530,14 +544,18 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_iirr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iirr_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 28, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 28
; CHECK-NEXT: ret;
@@ -551,14 +569,19 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_irir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irir_param_1];
+; CHECK-NEXT: mov.b32 %r2, 3;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT: mov.b32 %r5, 1;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 29, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r7;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 29
; CHECK-NEXT: ret;
@@ -572,14 +595,18 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_irri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_irri_param_1];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 30, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 30
; CHECK-NEXT: ret;
@@ -593,14 +620,18 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_riir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riir_param_1];
+; CHECK-NEXT: mov.b32 %r2, 3;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 31, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 31
; CHECK-NEXT: ret;
@@ -614,14 +645,17 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_riri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riri_param_1];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 32, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 32
; CHECK-NEXT: ret;
@@ -635,14 +669,18 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
; CHECK-LABEL: st_param_v4_i8_rrii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrii_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: mov.b32 %r4, 3;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 33, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 33
; CHECK-NEXT: ret;
@@ -656,13 +694,18 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
define void @st_param_v4_i8_iiir(i8 %d) {
; CHECK-LABEL: st_param_v4_i8_iiir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiir_param_0];
+; CHECK-NEXT: mov.b32 %r2, 3;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: mov.b32 %r4, 1;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 34, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 34
; CHECK-NEXT: ret;
@@ -676,13 +719,17 @@ define void @st_param_v4_i8_iiir(i8 %d) {
define void @st_param_v4_i8_iiri(i8 %c) {
; CHECK-LABEL: st_param_v4_i8_iiri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiri_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 35, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 35
; CHECK-NEXT: ret;
@@ -696,13 +743,18 @@ define void @st_param_v4_i8_iiri(i8 %c) {
define void @st_param_v4_i8_irii(i8 %b) {
; CHECK-LABEL: st_param_v4_i8_irii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irii_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irii_param_0];
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: mov.b32 %r4, 3;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 36, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 36
; CHECK-NEXT: ret;
@@ -716,13 +768,17 @@ define void @st_param_v4_i8_irii(i8 %b) {
define void @st_param_v4_i8_riii(i8 %a) {
; CHECK-LABEL: st_param_v4_i8_riii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riii_param_0];
+; CHECK-NEXT: mov.b32 %r1, 3;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riii_param_0];
+; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 37, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 37
; CHECK-NEXT: ret;
@@ -742,7 +798,7 @@ define void @st_param_v4_i16_iiii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 38, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 38
; CHECK-NEXT: ret;
@@ -841,13 +897,15 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) {
; CHECK-LABEL: st_param_v4_i16_iirr(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iirr_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_iirr_param_1];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 43, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 43
; CHECK-NEXT: ret;
@@ -946,13 +1004,15 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
; CHECK-LABEL: st_param_v4_i16_rrii(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrii_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrii_param_1];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 48, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 48
; CHECK-NEXT: ret;
@@ -966,13 +1026,16 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
define void @st_param_v4_i16_iiir(i16 %d) {
; CHECK-LABEL: st_param_v4_i16_iiir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiir_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 3;
+; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
; CHECK-NEXT: { // callseq 49, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 49
; CHECK-NEXT: ret;
@@ -986,13 +1049,16 @@ define void @st_param_v4_i16_iiir(i16 %d) {
define void @st_param_v4_i16_iiri(i16 %c) {
; CHECK-LABEL: st_param_v4_i16_iiri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiri_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 4;
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 50, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 50
; CHECK-NEXT: ret;
@@ -1006,13 +1072,16 @@ define void @st_param_v4_i16_iiri(i16 %c) {
define void @st_param_v4_i16_irii(i16 %b) {
; CHECK-LABEL: st_param_v4_i16_irii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irii_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 1;
+; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
; CHECK-NEXT: { // callseq 51, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 51
; CHECK-NEXT: ret;
@@ -1026,13 +1095,16 @@ define void @st_param_v4_i16_irii(i16 %b) {
define void @st_param_v4_i16_riii(i16 %a) {
; CHECK-LABEL: st_param_v4_i16_riii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riii_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 2;
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 52, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 52
; CHECK-NEXT: ret;
@@ -1672,13 +1744,12 @@ declare void @call_v4_f32(%struct.float4 alignstack(16))
define void @st_param_bfloat() {
; CHECK-LABEL: st_param_bfloat(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-EMPTY:
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.b16 %rs1, 0x4100;
; CHECK-NEXT: { // callseq 83, 0
; CHECK-NEXT: .param .align 2 .b8 param0[2];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
+; CHECK-NEXT: st.param.b16 [param0], 0x4100;
; CHECK-NEXT: call.uni call_bfloat, (param0);
; CHECK-NEXT: } // callseq 83
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 5b31b5e24bc68..c8ca6b6a3efa1 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -34,9 +34,9 @@ define void @test_store_param_def(i64 %param0, i32 %param1) {
; CHECK-NEXT: ld.param.b32 %r1, [test_store_param_def_param_1];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 16 .b8 param0[32];
+; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r2, %r1, %r3, %r4};
+; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r5, %r1};
; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1};
-; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5};
; CHECK-NEXT: call.uni test_call, (param0);
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index d6961a9541776..3138d7c4c14db 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -69,8 +69,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
; CHECK-NEXT: tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [tex0, {%r1}];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd3;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd3;
; CHECK-NEXT: call.uni (retval0), texfunc, (param0);
; CHECK-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 87e46b1505e31..697eb90fb1740 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Verifies correctness of load/store of parameters and return values.
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | %ptxas-verify %}
%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
@@ -24,37 +24,35 @@
define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
; CHECK-LABEL: test_s_i8i16p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<15>;
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8i16p_param_0+4];
-; CHECK-NEXT: shl.b16 %rs5, %rs4, 8;
-; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8i16p_param_0+3];
-; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6;
+; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i16p_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i16p_param_0+8];
-; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8i16p_param_0+2];
-; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i16p_param_0];
+; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i16p_param_0+4];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+2], %rs2;
-; CHECK-NEXT: st.param.b8 [param0+3], %rs3;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs4;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i16p, (param0);
-; CHECK-NEXT: ld.param.b16 %rs7, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+2];
+; CHECK-NEXT: ld.param.b16 %rs3, [retval0];
+; CHECK-NEXT: ld.param.b8 %rs4, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
-; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10;
-; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9;
+; CHECK-NEXT: shl.b16 %rs8, %rs4, 8;
+; CHECK-NEXT: or.b16 %rs9, %rs8, %rs5;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs5;
; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: shr.u16 %rs12, %rs9, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs12;
; CHECK-NEXT: ret;
%r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
ret %s_i8i16p %r
@@ -64,56 +62,51 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
; CHECK-LABEL: test_s_i8i32p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8i32p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8i32p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8i32p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8i32p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i32p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i32p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i32p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8i32p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8i32p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8i32p_param_0+8];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i32p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
ret %s_i8i32p %r
@@ -123,112 +116,66 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
; CHECK-LABEL: test_s_i8i64p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b64 %rd<68>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<46>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+10];
-; CHECK-NEXT: shl.b64 %rd5, %rd4, 8;
-; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8i64p_param_0+9];
-; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6;
-; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8i64p_param_0+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 16;
-; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8i64p_param_0+12];
-; CHECK-NEXT: shl.b64 %rd11, %rd10, 24;
-; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7;
-; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8i64p_param_0+14];
-; CHECK-NEXT: shl.b64 %rd15, %rd14, 8;
-; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8i64p_param_0+13];
-; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16;
-; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8i64p_param_0+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 16;
-; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8i64p_param_0+16];
-; CHECK-NEXT: shl.b64 %rd21, %rd20, 24;
-; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17;
-; CHECK-NEXT: shl.b64 %rd24, %rd23, 32;
-; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13;
-; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i64p_param_0+8];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i64p_param_0];
-; CHECK-NEXT: shr.u64 %rd25, %rd2, 8;
-; CHECK-NEXT: shr.u64 %rd26, %rd2, 16;
-; CHECK-NEXT: shr.u64 %rd27, %rd2, 24;
-; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24;
-; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16;
-; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8i64p_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24];
+; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+16];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 8 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b8 [param0+8], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+9], %rd2;
-; CHECK-NEXT: st.param.b8 [param0+10], %rd25;
-; CHECK-NEXT: st.param.b8 [param0+11], %rd26;
-; CHECK-NEXT: st.param.b8 [param0+12], %rd27;
-; CHECK-NEXT: st.param.b8 [param0+13], %rd23;
-; CHECK-NEXT: st.param.b8 [param0+14], %rd28;
-; CHECK-NEXT: st.param.b8 [param0+15], %rd29;
-; CHECK-NEXT: st.param.b8 [param0+16], %rd30;
-; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
; CHECK-NEXT: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: st.param.b8 [param0+16], %rd4;
+; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i64p, (param0);
-; CHECK-NEXT: ld.param.b64 %rd31, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12];
-; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
-; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
+; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24];
+; CHECK-NEXT: ld.param.b8 %rs1, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd6, [retval0];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13];
+; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12];
+; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11];
+; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
+; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
-; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
-; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
-; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
-; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
-; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
-; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
-; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
-; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
-; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
-; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
-; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
-; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
-; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
-; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
-; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
-; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
-; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
-; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
-; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
-; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
-; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
-; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
-; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
-; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
-; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
-; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
+; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
+; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
+; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
+; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
+; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
+; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
+; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
+; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
+; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
+; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
+; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
+; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
-; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
-; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
-; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
+; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
+; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
; CHECK-NEXT: ret;
%r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
ret %s_i8i64p %r
@@ -242,33 +189,32 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8f16p_param_0+4];
-; CHECK-NEXT: shl.b16 %rs5, %rs4, 8;
-; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8f16p_param_0+3];
-; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8];
-; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8f16p_param_0+2];
; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16p_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [test_s_i8f16p_param_0+2];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8];
+; CHECK-NEXT: ld.param.b8 %rs3, [test_s_i8f16p_param_0+4];
; CHECK-NEXT: { // callseq 3, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+2], %rs2;
-; CHECK-NEXT: st.param.b8 [param0+3], %rs3;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs4;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: st.param.b8 [param0+4], %rs3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+2], %rs2;
+; CHECK-NEXT: st.param.b16 [param0], %rs1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f16p, (param0);
-; CHECK-NEXT: ld.param.b16 %rs7, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %rs4, [retval0+2];
+; CHECK-NEXT: ld.param.b16 %rs5, [retval0];
+; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rs7, [retval0+3];
; CHECK-NEXT: } // callseq 3
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
-; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10;
-; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9;
+; CHECK-NEXT: shl.b16 %rs10, %rs6, 8;
+; CHECK-NEXT: or.b16 %rs11, %rs10, %rs7;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs7;
; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs4;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs5;
+; CHECK-NEXT: shr.u16 %rs14, %rs11, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs14;
; CHECK-NEXT: ret;
%r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
ret %s_i8f16p %r
@@ -278,56 +224,51 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
; CHECK-LABEL: test_s_i8f16x2p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f16x2p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f16x2p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f16x2p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f16x2p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f16x2p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16x2p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f16x2p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f16x2p_param_0+8];
; CHECK-NEXT: { // callseq 4, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f16x2p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 4
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
ret %s_i8f16x2p %r
@@ -337,56 +278,51 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
; CHECK-LABEL: test_s_i8f32p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f32p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f32p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f32p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f32p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f32p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f32p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f32p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f32p_param_0+8];
; CHECK-NEXT: { // callseq 5, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f32p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 5
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
ret %s_i8f32p %r
@@ -396,112 +332,66 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
; CHECK-LABEL: test_s_i8f64p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b64 %rd<68>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<46>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+10];
-; CHECK-NEXT: shl.b64 %rd5, %rd4, 8;
-; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8f64p_param_0+9];
-; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6;
-; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8f64p_param_0+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 16;
-; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8f64p_param_0+12];
-; CHECK-NEXT: shl.b64 %rd11, %rd10, 24;
-; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7;
-; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8f64p_param_0+14];
-; CHECK-NEXT: shl.b64 %rd15, %rd14, 8;
-; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8f64p_param_0+13];
-; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16;
-; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8f64p_param_0+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 16;
-; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8f64p_param_0+16];
-; CHECK-NEXT: shl.b64 %rd21, %rd20, 24;
-; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17;
-; CHECK-NEXT: shl.b64 %rd24, %rd23, 32;
-; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13;
-; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f64p_param_0+8];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f64p_param_0];
-; CHECK-NEXT: shr.u64 %rd25, %rd2, 8;
-; CHECK-NEXT: shr.u64 %rd26, %rd2, 16;
-; CHECK-NEXT: shr.u64 %rd27, %rd2, 24;
-; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24;
-; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16;
-; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8f64p_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
+; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+16];
; CHECK-NEXT: { // callseq 6, 0
; CHECK-NEXT: .param .align 8 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b8 [param0+8], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+9], %rd2;
-; CHECK-NEXT: st.param.b8 [param0+10], %rd25;
-; CHECK-NEXT: st.param.b8 [param0+11], %rd26;
-; CHECK-NEXT: st.param.b8 [param0+12], %rd27;
-; CHECK-NEXT: st.param.b8 [param0+13], %rd23;
-; CHECK-NEXT: st.param.b8 [param0+14], %rd28;
-; CHECK-NEXT: st.param.b8 [param0+15], %rd29;
-; CHECK-NEXT: st.param.b8 [param0+16], %rd30;
-; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
; CHECK-NEXT: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: st.param.b8 [param0+16], %rd4;
+; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f64p, (param0);
-; CHECK-NEXT: ld.param.b64 %rd31, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12];
-; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
-; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
+; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24];
+; CHECK-NEXT: ld.param.b8 %rs1, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd6, [retval0];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13];
+; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12];
+; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11];
+; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
+; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
; CHECK-NEXT: } // callseq 6
-; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
-; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
-; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
-; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
-; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
-; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
-; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
-; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
-; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
-; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
-; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
-; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
-; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
-; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
-; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
-; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
-; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
-; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
-; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
-; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
-; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
-; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
-; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
-; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
-; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
-; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
-; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
+; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
+; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
+; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
+; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
+; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
+; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
+; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
+; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
+; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
+; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
+; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
+; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
-; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
-; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
-; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
+; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
+; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
; CHECK-NEXT: ret;
%r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
ret %s_i8f64p %r
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 3ca729f07af8a..9e312a2fec60a 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -89,14 +89,14 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
; CHECK-NEXT: ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
; Store arguments to an array
-; CHECK32: .param .align 8 .b8 param1[28];
-; CHECK64: .param .align 8 .b8 param1[32];
-; CHECK-NEXT: st.param.b32 [param1], [[ARG_I32]];
-; CHECK-NEXT: st.param.b64 [param1+8], [[ARG_I64]];
-; CHECK-NEXT: st.param.b64 [param1+16], [[ARG_DOUBLE]];
-; CHECK-NEXT: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
-; CHECK-NEXT: .param .b32 retval0;
-; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
+; CHECK32: .param .align 8 .b8 param1[28];
+; CHECK64: .param .align 8 .b8 param1[32];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK-DAG: st.param.b32 [param1], [[ARG_I32]];
+; CHECK-DAG: st.param.b64 [param1+8], [[ARG_I64]];
+; CHECK-DAG: st.param.b64 [param1+16], [[ARG_DOUBLE]];
+; CHECK-DAG: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
+; CHECK-DAG: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
entry:
%ptr = load ptr, ptr addrspacecast (ptr addrspace(1) @foo_ptr to ptr), align 8
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index ad2e7044e93bc..a9b3675b67155 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -115,13 +115,13 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 0, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 0
@@ -218,13 +218,13 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 1
@@ -289,13 +289,13 @@ define dso_local i32 @baz() {
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 2, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 2
@@ -348,7 +348,6 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -360,18 +359,17 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd2];
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd7, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd5;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd7;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
+; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2];
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 3
; CHECK-PTX-NEXT: ret;
entry:
More information about the llvm-commits
mailing list