[llvm] a9de1ab - [NVPTX] Disable v2f32 registers when no operations supported, or via cl::opt (#154476)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 21 10:30:41 PDT 2025
Author: Alex MacLean
Date: 2025-08-21T10:30:36-07:00
New Revision: a9de1ab44df11ba82ee2fb5773abc920ec67cd73
URL: https://github.com/llvm/llvm-project/commit/a9de1ab44df11ba82ee2fb5773abc920ec67cd73
DIFF: https://github.com/llvm/llvm-project/commit/a9de1ab44df11ba82ee2fb5773abc920ec67cd73.diff
LOG: [NVPTX] Disable v2f32 registers when no operations supported, or via cl::opt (#154476)
The addition of v2f32 as a legal type, supported by the B64 register
class, has caused performance regressions, broken inline assembly, and
resulted in a couple of (now fixed) mis-compilations. To mitigate these
issues, mark v2f32 as a legal type only when the target has operations
that support it, since legality serves no purpose on targets where it
does not. To enable further debugging, add a cl::opt to disable v2f32.
To allow for a target-dependent set of legal types, ComputePTXValueVTs
has been fully rewritten to take advantage of the TargetLowering
call-lowering APIs.
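For anyone bisecting the regressions mentioned above, the escape hatch is the
hidden cl::opt added in NVPTXSubtarget.cpp. A minimal sketch of how it might be
exercised follows; the IR is illustrative (it is not the contents of the added
no-f32x2.ll test), and only -nvptx-no-f32x2 comes from this commit, the other
llc flags being the usual NVPTX ones:

  ; fadd_v2f32.ll -- hypothetical input
  define <2 x float> @fadd_v2f32(<2 x float> %a, <2 x float> %b) {
    %r = fadd <2 x float> %a, %b
    ret <2 x float> %r
  }

  ; With f32x2 available (sm_100 and PTX >= 8.6), <2 x float> may be kept in a
  ; single .b64 register:
  ;   llc fadd_v2f32.ll -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 -mattr=+ptx86 -o -
  ; Passing the new option falls back to the scalar lowering on pairs of .b32
  ; registers, matching the CHECK-NOF32X2 output in f32x2-instructions.ll below:
  ;   llc fadd_v2f32.ll -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 -mattr=+ptx86 -nvptx-no-f32x2 -o -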
Added:
llvm/test/CodeGen/NVPTX/no-f32x2.ll
Modified:
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
llvm/lib/Target/NVPTX/NVPTXSubtarget.h
llvm/test/CodeGen/NVPTX/aggregate-return.ll
llvm/test/CodeGen/NVPTX/bf16-instructions.ll
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
llvm/test/CodeGen/NVPTX/ldparam-v4.ll
llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
llvm/test/CodeGen/NVPTX/mulwide.ll
llvm/test/CodeGen/NVPTX/param-load-store.ll
llvm/test/CodeGen/NVPTX/pr126337.ll
llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
llvm/test/CodeGen/NVPTX/vec-param-load.ll
llvm/test/CodeGen/NVPTX/vector-loads.ll
llvm/test/CodeGen/NVPTX/vector-stores.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ad56d2f12caf6..bb4bb1195f78b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -196,7 +196,8 @@ static bool IsPTXVectorType(MVT VT) {
// - unsigned int NumElts - The number of elements in the final vector
// - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, MVT>>
-getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
+getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
+ unsigned AddressSpace) {
if (!VectorEVT.isSimple())
return std::nullopt;
const MVT VectorVT = VectorEVT.getSimpleVT();
@@ -213,6 +214,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
// The size of the PTX virtual register that holds a packed type.
unsigned PackRegSize;
+ bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
+
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -263,6 +266,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
LLVM_FALLTHROUGH;
case MVT::v2f32: // <1 x f32x2>
case MVT::v4f32: // <2 x f32x2>
+ if (!STI.hasF32x2Instructions())
+ return std::pair(NumElts, EltVT);
PackRegSize = 64;
break;
}
@@ -278,97 +283,44 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
}
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
-/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
-/// into their primitive components.
+/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
+/// the types as required by the calling convention (with special handling for
+/// i8s).
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ LLVMContext &Ctx, CallingConv::ID CallConv,
Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<uint64_t> *Offsets = nullptr,
+ SmallVectorImpl<uint64_t> &Offsets,
uint64_t StartingOffset = 0) {
SmallVector<EVT, 16> TempVTs;
SmallVector<uint64_t, 16> TempOffsets;
-
- // Special case for i128 - decompose to (i64, i64)
- if (Ty->isIntegerTy(128) || Ty->isFP128Ty()) {
- ValueVTs.append({MVT::i64, MVT::i64});
-
- if (Offsets)
- Offsets->append({StartingOffset + 0, StartingOffset + 8});
-
- return;
- }
-
- // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- auto const *SL = DL.getStructLayout(STy);
- auto ElementNum = 0;
- for(auto *EI : STy->elements()) {
- ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
- StartingOffset + SL->getElementOffset(ElementNum));
- ++ElementNum;
- }
- return;
- }
-
- // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs.
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- Type *EltTy = ATy->getElementType();
- uint64_t EltSize = DL.getTypeAllocSize(EltTy);
- for (int I : llvm::seq<int>(ATy->getNumElements()))
- ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize);
- return;
- }
-
- // Will split structs and arrays into member types, but will not split vector
- // types. We do that manually below.
ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
- for (auto [VT, Off] : zip(TempVTs, TempOffsets)) {
- // Split vectors into individual elements that fit into registers.
- if (VT.isVector()) {
- unsigned NumElts = VT.getVectorNumElements();
- EVT EltVT = VT.getVectorElementType();
- // Below we must maintain power-of-2 sized vectors because
- // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
- // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
- // vectors.
-
- // If the element type belongs to one of the supported packed vector types
- // then we can pack multiples of this element into a single register.
- if (VT == MVT::v2i8) {
- // We can pack 2 i8s into a single 16-bit register. We only do this for
- // loads and stores, which is why we have a separate case for it.
- EltVT = MVT::v2i8;
- NumElts = 1;
- } else if (VT == MVT::v3i8) {
- // We can also pack 3 i8s into 32-bit register, leaving the 4th
- // element undefined.
- EltVT = MVT::v4i8;
- NumElts = 1;
- } else if (NumElts > 1 && isPowerOf2_32(NumElts)) {
- // Handle default packed types.
- for (MVT PackedVT : NVPTX::packed_types()) {
- const auto NumEltsPerReg = PackedVT.getVectorNumElements();
- if (NumElts % NumEltsPerReg == 0 &&
- EltVT == PackedVT.getVectorElementType()) {
- EltVT = PackedVT;
- NumElts /= NumEltsPerReg;
- break;
- }
- }
- }
+ for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
+ MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
+ unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+
+ // Since we actually can load/store b8, we need to ensure that we'll use
+ // the original sized type for any i8s or i8 vectors.
+ if (VT.getScalarType() == MVT::i8) {
+ if (RegisterVT == MVT::i16)
+ RegisterVT = MVT::i8;
+ else if (RegisterVT == MVT::v2i16)
+ RegisterVT = MVT::v2i8;
+ else
+ assert(RegisterVT == MVT::v4i8 &&
+ "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
+ }
- for (unsigned J : seq(NumElts)) {
- ValueVTs.push_back(EltVT);
- if (Offsets)
- Offsets->push_back(Off + J * EltVT.getStoreSize());
- }
- } else {
- ValueVTs.push_back(VT);
- if (Offsets)
- Offsets->push_back(Off);
+ // TODO: This is horribly incorrect for cases where the vector elements are
+ // not a multiple of bytes (ex i1) and legal or i8. However, this problem
+ // has existed for as long as NVPTX has and no one has complained, so we'll
+ // leave it for now.
+ for (unsigned I : seq(NumRegs)) {
+ ValueVTs.push_back(RegisterVT);
+ Offsets.push_back(Off + I * RegisterVT.getStoreSize());
}
}
}
@@ -631,7 +583,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
- addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
+
+ if (STI.hasF32x2Instructions())
+ addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
// Conversion to/from FP16/FP16x2 is always legal.
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -672,7 +626,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand);
// Need custom lowering in case the index is dynamic.
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ if (STI.hasF32x2Instructions())
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
@@ -1606,7 +1561,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset);
+ ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
+ VAOffset);
assert(VTs.size() == Offsets.size() && "Size mismatch");
assert(VTs.size() == ArgOuts.size() && "Size mismatch");
@@ -1756,7 +1712,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+ ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
@@ -3217,8 +3173,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (ValVT != MemVT)
return SDValue();
- const auto NumEltsAndEltVT = getVectorLoweringShape(
- ValVT, STI.has256BitVectorLoadStore(N->getAddressSpace()));
+ const auto NumEltsAndEltVT =
+ getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
if (!NumEltsAndEltVT)
return SDValue();
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
@@ -3386,6 +3342,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
auto PtrVT = getPointerTy(DAG.getDataLayout());
const Function &F = DAG.getMachineFunction().getFunction();
@@ -3457,7 +3414,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
} else {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
+ ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
assert(VTs.size() == ArgIns.size() && "Size mismatch");
assert(VTs.size() == Offsets.size() && "Size mismatch");
@@ -3469,7 +3426,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
for (const unsigned NumElts : VI) {
// i1 is loaded/stored as i8
const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
- const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext());
+ const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
SDValue VecAddr = DAG.getObjectPtrOffset(
dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
@@ -3514,6 +3471,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &Ctx = *DAG.getContext();
const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
@@ -3526,7 +3484,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+ ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
const auto GetRetVal = [&](unsigned I) -> SDValue {
@@ -5985,8 +5943,8 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
if (ResVT != MemVT)
return;
- const auto NumEltsAndEltVT = getVectorLoweringShape(
- ResVT, STI.has256BitVectorLoadStore(LD->getAddressSpace()));
+ const auto NumEltsAndEltVT =
+ getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
if (!NumEltsAndEltVT)
return;
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index e5d680c19d921..a84ceaba991c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -29,6 +29,12 @@ static cl::opt<bool>
NoF16Math("nvptx-no-f16-math", cl::Hidden,
cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
cl::init(false));
+
+static cl::opt<bool> NoF32x2("nvptx-no-f32x2", cl::Hidden,
+ cl::desc("NVPTX Specific: Disable generation of "
+ "f32x2 instructions and registers."),
+ cl::init(false));
+
// Pin the vtable to this file.
void NVPTXSubtarget::anchor() {}
@@ -70,6 +76,10 @@ bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
+bool NVPTXSubtarget::hasF32x2Instructions() const {
+ return SmVersion >= 100 && PTXVersion >= 86 && !NoF32x2;
+}
+
bool NVPTXSubtarget::hasNativeBF16Support(int Opcode) const {
if (!hasBF16Math())
return false;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 81af55edccadb..acf025b70ce34 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -117,9 +117,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
return HasTcgen05 && PTXVersion >= 86;
}
// f32x2 instructions in Blackwell family
- bool hasF32x2Instructions() const {
- return SmVersion >= 100 && PTXVersion >= 86;
- }
+ bool hasF32x2Instructions() const;
// TMA G2S copy with cta_group::1/2 support
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index bf51973e88357..fab60bdb3f2d1 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,19 +10,20 @@ declare {float, float} @bars({float, float} %input)
define void @test_v2f32(<2 x float> %input, ptr %output) {
; CHECK-LABEL: test_v2f32(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), barv, (param0);
-; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: ld.param.b64 %rd3, [test_v2f32_param_1];
-; CHECK-NEXT: st.b64 [%rd3], %rd2;
+; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_1];
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r3, %r4};
; CHECK-NEXT: ret;
%call = tail call <2 x float> @barv(<2 x float> %input)
store <2 x float> %call, ptr %output, align 8
@@ -32,24 +33,28 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
define void @test_v3f32(<3 x float> %input, ptr %output) {
; CHECK-LABEL: test_v3f32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 16 .b8 param0[16];
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
-; CHECK-NEXT: st.param.b32 [param0+8], %r1;
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b32 [param0+8], %r3;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), barv3, (param0);
-; CHECK-NEXT: ld.param.b32 %r2, [retval0+8];
-; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: ld.param.b32 %r4, [retval0+8];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: ld.param.b64 %rd3, [test_v3f32_param_1];
-; CHECK-NEXT: st.b32 [%rd3+8], %r2;
-; CHECK-NEXT: st.b64 [%rd3], %rd2;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r5;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
+; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: ld.param.b64 %rd5, [test_v3f32_param_1];
+; CHECK-NEXT: st.b32 [%rd5+8], %r4;
+; CHECK-NEXT: st.b64 [%rd5], %rd4;
; CHECK-NEXT: ret;
%call = tail call <3 x float> @barv3(<3 x float> %input)
; Make sure we don't load more values than we need to.
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index aee58a044a986..a386e4292777b 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
+; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
-; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
; SM70-NEXT: shl.b32 %r8, %r7, 16;
-; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
; SM70-NEXT: shl.b32 %r10, %r9, 16;
-; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
+; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
; SM70-NEXT: shl.b32 %r12, %r11, 16;
-; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
-; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
; SM70-NEXT: shl.b32 %r16, %r15, 16;
-; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
+; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
; SM70-NEXT: shl.b32 %r18, %r17, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
+; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
; SM70-NEXT: shl.b32 %r20, %r19, 16;
; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14};
; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6};
@@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-NEXT: cvt.f32.bf16 %r5, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %r6, %rs1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-NEXT: cvt.f32.bf16 %r7, %rs4;
-; SM80-NEXT: cvt.f32.bf16 %r8, %rs3;
-; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; SM80-NEXT: cvt.f32.bf16 %r9, %rs6;
-; SM80-NEXT: cvt.f32.bf16 %r10, %rs5;
-; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; SM80-NEXT: cvt.f32.bf16 %r11, %rs8;
-; SM80-NEXT: cvt.f32.bf16 %r12, %rs7;
+; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM80-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM80-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM80-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM80-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM80-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM80-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM80-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM80-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-NEXT: ret;
@@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1;
-; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3;
-; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5;
-; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7;
+; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-FTZ-NEXT: ret;
@@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; SM90-NEXT: cvt.f32.bf16 %r5, %rs2;
-; SM90-NEXT: cvt.f32.bf16 %r6, %rs1;
-; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM90-NEXT: cvt.f32.bf16 %r7, %rs4;
-; SM90-NEXT: cvt.f32.bf16 %r8, %rs3;
-; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; SM90-NEXT: cvt.f32.bf16 %r9, %rs6;
-; SM90-NEXT: cvt.f32.bf16 %r10, %rs5;
-; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; SM90-NEXT: cvt.f32.bf16 %r11, %rs8;
-; SM90-NEXT: cvt.f32.bf16 %r12, %rs7;
+; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM90-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM90-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM90-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM90-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM90-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM90-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM90-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM90-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM90-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 64c7792a61c8c..7b2126870e319 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -596,18 +596,15 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-F16: {
; CHECK-F16-NEXT: .reg .pred %p<3>;
; CHECK-F16-NEXT: .reg .b32 %r<9>;
-; CHECK-F16-NEXT: .reg .b64 %rd<3>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
-; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
-; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2;
-; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1;
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6;
+; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2;
+; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1;
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-F16-NEXT: ret;
;
@@ -616,25 +613,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-NOF16-NEXT: .reg .pred %p<3>;
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
-; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
-; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1;
-; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5;
-; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2;
-; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2;
-; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1;
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3];
+; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9;
+; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2;
+; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1;
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11};
; CHECK-NOF16-NEXT: ret;
<2 x half> %c, <2 x half> %d) #0 {
@@ -649,17 +643,14 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3;
-; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4;
+; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5;
+; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6;
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2;
@@ -1501,11 +1492,9 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2;
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1;
; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1};
@@ -1928,12 +1917,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-F16: {
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
; CHECK-F16-NEXT: .reg .b32 %r<8>;
-; CHECK-F16-NEXT: .reg .b64 %rd<2>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2;
; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1};
@@ -1947,21 +1934,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<6>;
-; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1;
-; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; }
-; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4;
-; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; }
+; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1;
+; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1;
; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; }
-; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; }
+; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6;
; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5};
; CHECK-NOF16-NEXT: ret;
%tb = fptrunc <2 x float> %b to <2 x half>
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index bcaefa1699d8b..7ca16f702d8f3 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -30,12 +30,10 @@ define <2 x float> @test_ret_const() #0 {
define float @test_extract_0(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_extract_0(
; CHECK-NOF32X2: {
-; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
-; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0];
; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NOF32X2-NEXT: ret;
;
@@ -56,13 +54,11 @@ define float @test_extract_0(<2 x float> %a) #0 {
define float @test_extract_1(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_extract_1(
; CHECK-NOF32X2: {
-; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
-; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
-; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0];
+; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_extract_1(
@@ -80,20 +76,42 @@ define float @test_extract_1(<2 x float> %a) #0 {
}
define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
-; CHECK-LABEL: test_extract_i(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
-; CHECK-NEXT: setp.eq.b64 %p1, %rd2, 0;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_extract_i(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .local .align 8 .b8 __local_depot3[8];
+; CHECK-NOF32X2-NEXT: .reg .b64 %SP;
+; CHECK-NOF32X2-NEXT: .reg .b64 %SPL;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<6>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: mov.b64 %SPL, __local_depot3;
+; CHECK-NOF32X2-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
+; CHECK-NOF32X2-NEXT: st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: and.b64 %rd2, %rd1, 1;
+; CHECK-NOF32X2-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NOF32X2-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-NOF32X2-NEXT: or.b64 %rd5, %rd4, %rd3;
+; CHECK-NOF32X2-NEXT: ld.b32 %r3, [%rd5];
+; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_extract_i(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<2>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
+; CHECK-F32X2-NEXT: setp.eq.b64 %p1, %rd2, 0;
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F32X2-NEXT: ret;
%e = extractelement <2 x float> %a, i64 %idx
ret float %e
}
@@ -102,15 +120,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fadd(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -132,11 +147,9 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0];
; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -163,11 +176,9 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0];
; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -194,20 +205,15 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r10, %r8;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r9, %r7;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_v4(
@@ -229,17 +235,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_0_v4(
@@ -267,17 +270,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_1_v4(
@@ -305,15 +305,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fsub(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0];
+; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -332,18 +329,29 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
}
define <2 x float> @test_fneg(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fneg(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: neg.f32 %r3, %r2;
-; CHECK-NEXT: neg.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fneg(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0];
+; CHECK-NOF32X2-NEXT: neg.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fneg(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: neg.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: neg.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = fneg <2 x float> %a
ret <2 x float> %r
}
@@ -352,15 +360,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_fmul(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0];
+; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -379,50 +384,85 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
}
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fdiv(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2;
-; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fdiv(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fdiv(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: div.rn.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_frem(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<15>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5;
-; CHECK-NEXT: neg.f32 %r7, %r6;
-; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4;
-; CHECK-NEXT: testp.infinite.f32 %p1, %r2;
-; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1;
-; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10;
-; CHECK-NEXT: neg.f32 %r12, %r11;
-; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3;
-; CHECK-NEXT: testp.infinite.f32 %p2, %r1;
-; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_frem(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4;
+; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1;
+; CHECK-NOF32X2-NEXT: div.rn.f32 %r10, %r1, %r3;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r11, %r10;
+; CHECK-NOF32X2-NEXT: neg.f32 %r12, %r11;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3;
+; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_frem(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-F32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-F32X2-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2;
+; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-F32X2-NEXT: div.rn.f32 %r10, %r3, %r1;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r11, %r10;
+; CHECK-F32X2-NEXT: neg.f32 %r12, %r11;
+; CHECK-F32X2-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1;
+; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-F32X2-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
@@ -431,15 +471,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -461,11 +498,9 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0];
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -492,11 +527,9 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0];
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000;
; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -523,20 +556,15 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r10, %r8;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r9, %r7;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_v4_ftz(
@@ -558,17 +586,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz(
@@ -596,17 +621,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000;
-; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000;
-; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
+; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000;
+; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000;
+; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz(
@@ -634,15 +656,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fsub_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0];
+; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -661,18 +680,29 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
}
define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
-; CHECK-LABEL: test_fneg_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: neg.ftz.f32 %r3, %r2;
-; CHECK-NEXT: neg.ftz.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fneg_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0];
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fneg_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = fneg <2 x float> %a
ret <2 x float> %r
}
@@ -681,15 +711,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NOF32X2-LABEL: test_fmul_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0];
+; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -711,17 +738,13 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-NOF32X2-LABEL: test_fma_ftz(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r6, %r4, %r2;
-; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r5, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0];
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -741,65 +764,112 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
}
define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fdiv_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fdiv_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fdiv_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_frem_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<15>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
-; CHECK-NEXT: neg.ftz.f32 %r7, %r6;
-; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4;
-; CHECK-NEXT: testp.infinite.f32 %p1, %r2;
-; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1;
-; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
-; CHECK-NEXT: neg.ftz.f32 %r12, %r11;
-; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3;
-; CHECK-NEXT: testp.infinite.f32 %p2, %r1;
-; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_frem_ftz(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0];
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r7, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p1, %r4;
+; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r8, %p1;
+; CHECK-NOF32X2-NEXT: div.rn.ftz.f32 %r10, %r1, %r3;
+; CHECK-NOF32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
+; CHECK-NOF32X2-NEXT: neg.ftz.f32 %r12, %r11;
+; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1;
+; CHECK-NOF32X2-NEXT: testp.infinite.f32 %p2, %r3;
+; CHECK-NOF32X2-NEXT: selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_frem_ftz(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<15>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r7, %r6;
+; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p1, %r2;
+; CHECK-F32X2-NEXT: selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-F32X2-NEXT: div.rn.ftz.f32 %r10, %r3, %r1;
+; CHECK-F32X2-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10;
+; CHECK-F32X2-NEXT: neg.ftz.f32 %r12, %r11;
+; CHECK-F32X2-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3;
+; CHECK-F32X2-NEXT: testp.infinite.f32 %p2, %r1;
+; CHECK-F32X2-NEXT: selp.f32 %r14, %r3, %r13, %p2;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9};
+; CHECK-F32X2-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v2f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-NEXT: ld.b64 %rd3, [%rd1];
-; CHECK-NEXT: st.b64 [%rd2], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v2f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NOF32X2-NEXT: st.v2.b32 [%rd2], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v2f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-F32X2-NEXT: ld.b64 %rd3, [%rd1];
+; CHECK-F32X2-NEXT: st.b64 [%rd2], %rd3;
+; CHECK-F32X2-NEXT: ret;
%t1 = load <2 x float>, ptr %a
store <2 x float> %t1, ptr %b, align 32
ret void
@@ -825,34 +895,60 @@ define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
}
define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v4f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
-; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v4f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v4f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
+; CHECK-F32X2-NEXT: ret;
%t1 = load <4 x float>, ptr %a
store <4 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v8f32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
-; CHECK-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16];
-; CHECK-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6};
-; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ldst_v8f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NOF32X2-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NOF32X2-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ldst_v8f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<7>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1];
+; CHECK-F32X2-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16];
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6};
+; CHECK-F32X2-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
+; CHECK-F32X2-NEXT: ret;
%t1 = load <8 x float>, ptr %a
store <8 x float> %t1, ptr %b, align 32
ret void
@@ -861,571 +957,982 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
-; CHECK-NEXT: { // callseq 0, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_call(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 0, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOF32X2-NEXT: } // callseq 0
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_call(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_param_0];
+; CHECK-F32X2-NEXT: { // callseq 0, 0
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-F32X2-NEXT: } // callseq 0
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
-; CHECK-NEXT: { // callseq 1, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_call_flipped(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_flipped_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_flipped_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 1, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOF32X2-NEXT: } // callseq 1
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_call_flipped(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
+; CHECK-F32X2-NEXT: { // callseq 1, 0
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-F32X2-NEXT: } // callseq 1
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
-; CHECK-NEXT: { // callseq 2, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
-; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_tailcall_flipped(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_tailcall_flipped_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_tailcall_flipped_param_0];
+; CHECK-NOF32X2-NEXT: { // callseq 2, 0
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOF32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOF32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOF32X2-NEXT: } // callseq 2
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_tailcall_flipped(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
+; CHECK-F32X2-NEXT: { // callseq 2, 0
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-F32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-F32X2-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-F32X2-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-F32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-F32X2-NEXT: } // callseq 2
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
%r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
-; CHECK-LABEL: test_select(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2];
-; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0];
-; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_select(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<2>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2];
+; CHECK-NOF32X2-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NOF32X2-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_param_0];
+; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p1;
+; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, %p1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_select(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<2>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2];
+; CHECK-F32X2-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-F32X2-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_param_0];
+; CHECK-F32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
%r = select i1 %c, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-LABEL: test_select_cc(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<11>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
-; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3;
-; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1;
-; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2;
-; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_select_cc(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_3];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0];
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r5, %r7;
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r6, %r8;
+; CHECK-NOF32X2-NEXT: selp.f32 %r9, %r2, %r4, %p2;
+; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r1, %r3, %p1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_select_cc(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<11>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd3;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2;
+; CHECK-F32X2-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-F32X2-NEXT: selp.f32 %r9, %r8, %r6, %p2;
+; CHECK-F32X2-NEXT: selp.f32 %r10, %r7, %r5, %p1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-F32X2-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-LABEL: test_select_cc_f64_f32(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1];
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3];
-; CHECK-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd6;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd5;
-; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1;
-; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2;
-; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2;
-; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_select_cc_f64_f32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r1, %r3;
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r2, %r4;
+; CHECK-NOF32X2-NEXT: selp.f64 %rd5, %rd2, %rd4, %p2;
+; CHECK-NOF32X2-NEXT: selp.f64 %rd6, %rd1, %rd3, %p1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_select_cc_f64_f32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<9>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd6;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd5;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r3, %r1;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r4, %r2;
+; CHECK-F32X2-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2;
+; CHECK-F32X2-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1;
+; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7};
+; CHECK-F32X2-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
ret <2 x double> %r
}
define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
-; CHECK-LABEL: test_select_cc_f32_f64(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
-; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5;
-; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2;
-; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_select_cc_f32_f64(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0];
+; CHECK-NOF32X2-NEXT: setp.neu.f64 %p1, %rd1, %rd3;
+; CHECK-NOF32X2-NEXT: setp.neu.f64 %p2, %rd2, %rd4;
+; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r2, %r4, %p2;
+; CHECK-NOF32X2-NEXT: selp.f32 %r6, %r1, %r3, %p1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_select_cc_f32_f64(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<7>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3];
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
+; CHECK-F32X2-NEXT: setp.neu.f64 %p1, %rd3, %rd5;
+; CHECK-F32X2-NEXT: setp.neu.f64 %p2, %rd4, %rd6;
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r2, %p2;
+; CHECK-F32X2-NEXT: selp.f32 %r6, %r3, %r1, %p1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%cc = fcmp une <2 x double> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_une(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_une(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0];
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.neu.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_une(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.neu.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp une <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ueq(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.equ.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ueq(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0];
+; CHECK-NOF32X2-NEXT: setp.equ.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.equ.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ueq(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.equ.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.equ.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ueq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ugt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ugt(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0];
+; CHECK-NOF32X2-NEXT: setp.gtu.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.gtu.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ugt(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.gtu.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.gtu.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ugt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_uge(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_uge(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0];
+; CHECK-NOF32X2-NEXT: setp.geu.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.geu.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_uge(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.geu.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.geu.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp uge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ult(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ult(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0];
+; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.ltu.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ult(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.ltu.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.ltu.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ult <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ule(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ule(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0];
+; CHECK-NOF32X2-NEXT: setp.leu.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.leu.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ule(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.leu.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.leu.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ule <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_uno(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_uno(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0];
+; CHECK-NOF32X2-NEXT: setp.nan.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.nan.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_uno(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.nan.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.nan.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp uno <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_one(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_one(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0];
+; CHECK-NOF32X2-NEXT: setp.ne.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.ne.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_one(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.ne.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.ne.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp one <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_oeq(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_oeq(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0];
+; CHECK-NOF32X2-NEXT: setp.eq.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.eq.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_oeq(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.eq.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.eq.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp oeq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ogt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ogt(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0];
+; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.gt.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ogt(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.gt.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ogt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_oge(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_oge(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0];
+; CHECK-NOF32X2-NEXT: setp.ge.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.ge.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_oge(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.ge.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.ge.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp oge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_olt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_olt(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0];
+; CHECK-NOF32X2-NEXT: setp.lt.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_olt(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.lt.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp olt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ole(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ole(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0];
+; CHECK-NOF32X2-NEXT: setp.le.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.le.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ole(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.le.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.le.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ole <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2;
-; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1;
-; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fcmp_ord(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0];
+; CHECK-NOF32X2-NEXT: setp.num.f32 %p1, %r2, %r4;
+; CHECK-NOF32X2-NEXT: setp.num.f32 %p2, %r1, %r3;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NOF32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-NOF32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fcmp_ord(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: setp.num.f32 %p1, %r4, %r2;
+; CHECK-F32X2-NEXT: setp.num.f32 %p2, %r3, %r1;
+; CHECK-F32X2-NEXT: selp.b16 %rs1, -1, 0, %p2;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-F32X2-NEXT: selp.b16 %rs2, -1, 0, %p1;
+; CHECK-F32X2-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F32X2-NEXT: ret;
%r = fcmp ord <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fptosi_i32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fptosi_i32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.s32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2;
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fptosi_i64(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd1, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fptosi_i64(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd2, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.s64.f32 %rd3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-F32X2-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptoui_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fptoui_2xi32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fptoui_2xi32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.u32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptoui_2xi64(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2;
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fptoui_2xi64(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd1, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fptoui_2xi64(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd2, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.u64.f32 %rd3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-F32X2-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
@@ -1496,16 +2003,14 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_fadd(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_uitofp_2xi32_fadd_param_1];
; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1;
-; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4;
-; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r5, %r1;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r6, %r2;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r4, %r6;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r3, %r5;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -1529,48 +2034,81 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
}
define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
-; CHECK-LABEL: test_fptrunc_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0];
-; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2;
-; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1;
-; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fptrunc_2xdouble(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fptrunc_2xdouble(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r1, %rd2;
+; CHECK-F32X2-NEXT: cvt.rn.f32.f64 %r2, %rd1;
+; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: ret;
%r = fptrunc <2 x double> %a to <2 x float>
ret <2 x float> %r
}
define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fpext_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.f64.f32 %rd2, %r2;
-; CHECK-NEXT: cvt.f64.f32 %rd3, %r1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fpext_2xdouble(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0];
+; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r2;
+; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fpext_2xdouble(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.f64.f32 %rd2, %r2;
+; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-F32X2-NEXT: ret;
%r = fpext <2 x float> %a to <2 x double>
ret <2 x double> %r
}
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-F32X2-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
@@ -1602,31 +2140,51 @@ define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
}
define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xfloat_to_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-F32X2-NEXT: ret;
%r = bitcast <2 x float> %a to double
ret double %r
}
define <2 x float> @test_sqrt(<2 x float> %a) #0 {
-; CHECK-LABEL: test_sqrt(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: sqrt.rn.f32 %r3, %r2;
-; CHECK-NEXT: sqrt.rn.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_sqrt(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0];
+; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: sqrt.rn.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_sqrt(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: sqrt.rn.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: sqrt.rn.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.sqrt(<2 x float> %a)
ret <2 x float> %r
}
@@ -1639,35 +2197,57 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 {
;}
define <2 x float> @test_sin(<2 x float> %a) #0 {
-; CHECK-LABEL: test_sin(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_sin_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: sin.approx.f32 %r3, %r2;
-; CHECK-NEXT: sin.approx.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_sin(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0];
+; CHECK-NOF32X2-NEXT: sin.approx.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: sin.approx.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_sin(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sin_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: sin.approx.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: sin.approx.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call afn <2 x float> @llvm.sin(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_cos(<2 x float> %a) #0 {
-; CHECK-LABEL: test_cos(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_cos_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cos.approx.f32 %r3, %r2;
-; CHECK-NEXT: cos.approx.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_cos(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0];
+; CHECK-NOF32X2-NEXT: cos.approx.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cos.approx.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_cos(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_cos_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cos.approx.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cos.approx.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call afn <2 x float> @llvm.cos(<2 x float> %a)
ret <2 x float> %r
}
@@ -1719,17 +2299,13 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
; CHECK-NOF32X2-LABEL: test_fma(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
-; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0];
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -1749,266 +2325,448 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
}
define <2 x float> @test_fabs(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fabs(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fabs_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: abs.f32 %r3, %r2;
-; CHECK-NEXT: abs.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_fabs(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0];
+; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: abs.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_fabs(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fabs_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: abs.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: abs.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.fabs(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_minnum(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_minnum_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_minnum_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: min.f32 %r5, %r4, %r2;
-; CHECK-NEXT: min.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_minnum(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0];
+; CHECK-NOF32X2-NEXT: min.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: min.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_minnum(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_minnum_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_minnum_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: min.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: min.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_maxnum(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: max.f32 %r5, %r4, %r2;
-; CHECK-NEXT: max.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_maxnum(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0];
+; CHECK-NOF32X2-NEXT: max.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: max.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_maxnum(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-F32X2-NEXT: max.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: max.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_copysign(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NEXT: copysign.f32 %r5, %r4, %r2;
-; CHECK-NEXT: copysign.f32 %r6, %r3, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_copysign(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0];
+; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_copysign(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-F32X2-NEXT: copysign.f32 %r5, %r4, %r2;
+; CHECK-F32X2-NEXT: copysign.f32 %r6, %r3, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_copysign_f64(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<8>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0];
-; CHECK-NEXT: shr.u64 %rd4, %rd3, 63;
-; CHECK-NEXT: and.b64 %rd5, %rd4, 1;
-; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: abs.f32 %r3, %r2;
-; CHECK-NEXT: neg.f32 %r4, %r3;
-; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1;
-; CHECK-NEXT: shr.u64 %rd6, %rd2, 63;
-; CHECK-NEXT: and.b64 %rd7, %rd6, 1;
-; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0;
-; CHECK-NEXT: abs.f32 %r6, %r1;
-; CHECK-NEXT: neg.f32 %r7, %r6;
-; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_copysign_f64(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<7>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0];
+; CHECK-NOF32X2-NEXT: abs.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: neg.f32 %r4, %r3;
+; CHECK-NOF32X2-NEXT: shr.u64 %rd3, %rd2, 63;
+; CHECK-NOF32X2-NEXT: and.b64 %rd4, %rd3, 1;
+; CHECK-NOF32X2-NEXT: setp.ne.b64 %p1, %rd4, 0;
+; CHECK-NOF32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1;
+; CHECK-NOF32X2-NEXT: abs.f32 %r6, %r1;
+; CHECK-NOF32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-NOF32X2-NEXT: shr.u64 %rd5, %rd1, 63;
+; CHECK-NOF32X2-NEXT: and.b64 %rd6, %rd5, 1;
+; CHECK-NOF32X2-NEXT: setp.ne.b64 %p2, %rd6, 0;
+; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_copysign_f64(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<3>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<8>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0];
+; CHECK-F32X2-NEXT: shr.u64 %rd4, %rd3, 63;
+; CHECK-F32X2-NEXT: and.b64 %rd5, %rd4, 1;
+; CHECK-F32X2-NEXT: setp.ne.b64 %p1, %rd5, 0;
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: abs.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: neg.f32 %r4, %r3;
+; CHECK-F32X2-NEXT: selp.f32 %r5, %r4, %r3, %p1;
+; CHECK-F32X2-NEXT: shr.u64 %rd6, %rd2, 63;
+; CHECK-F32X2-NEXT: and.b64 %rd7, %rd6, 1;
+; CHECK-F32X2-NEXT: setp.ne.b64 %p2, %rd7, 0;
+; CHECK-F32X2-NEXT: abs.f32 %r6, %r1;
+; CHECK-F32X2-NEXT: neg.f32 %r7, %r6;
+; CHECK-F32X2-NEXT: selp.f32 %r8, %r7, %r6, %p2;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5};
+; CHECK-F32X2-NEXT: ret;
%tb = fptrunc <2 x double> %b to <2 x float>
%r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb)
ret <2 x float> %r
}
define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_copysign_extended(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NEXT: copysign.f32 %r5, %r3, %r1;
-; CHECK-NEXT: copysign.f32 %r6, %r4, %r2;
-; CHECK-NEXT: cvt.f64.f32 %rd3, %r6;
-; CHECK-NEXT: cvt.f64.f32 %rd4, %r5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_copysign_extended(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0];
+; CHECK-NOF32X2-NEXT: copysign.f32 %r5, %r3, %r1;
+; CHECK-NOF32X2-NEXT: copysign.f32 %r6, %r4, %r2;
+; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd1, %r6;
+; CHECK-NOF32X2-NEXT: cvt.f64.f32 %rd2, %r5;
+; CHECK-NOF32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_copysign_extended(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-F32X2-NEXT: copysign.f32 %r5, %r3, %r1;
+; CHECK-F32X2-NEXT: copysign.f32 %r6, %r4, %r2;
+; CHECK-F32X2-NEXT: cvt.f64.f32 %rd3, %r6;
+; CHECK-F32X2-NEXT: cvt.f64.f32 %rd4, %r5;
+; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b)
%xr = fpext <2 x float> %r to <2 x double>
ret <2 x double> %xr
}
define <2 x float> @test_floor(<2 x float> %a) #0 {
-; CHECK-LABEL: test_floor(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_floor_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_floor(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_floor(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_floor_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rmi.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.floor(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_ceil(<2 x float> %a) #0 {
-; CHECK-LABEL: test_ceil(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_ceil_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_ceil(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_ceil(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_ceil_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rpi.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.ceil(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_trunc(<2 x float> %a) #0 {
-; CHECK-LABEL: test_trunc(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_trunc(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_trunc(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.trunc(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_rint(<2 x float> %a) #0 {
-; CHECK-LABEL: test_rint(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_rint_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_rint(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_rint(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_rint_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.rint(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_nearbyint(<2 x float> %a) #0 {
-; CHECK-LABEL: test_nearbyint(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_nearbyint(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_nearbyint(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.nearbyint(<2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_roundeven(<2 x float> %a) #0 {
-; CHECK-LABEL: test_roundeven(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2;
-; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_roundeven(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_roundeven(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rni.f32.f32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.roundeven(<2 x float> %a)
ret <2 x float> %r
}
; check the use of sign mask and 0.5 to implement round
define <2 x float> @test_round(<2 x float> %a) #0 {
-; CHECK-LABEL: test_round(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_round_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: and.b32 %r3, %r2, -2147483648;
-; CHECK-NEXT: or.b32 %r4, %r3, 1056964608;
-; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5;
-; CHECK-NEXT: abs.f32 %r7, %r2;
-; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2;
-; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2;
-; CHECK-NEXT: and.b32 %r11, %r1, -2147483648;
-; CHECK-NEXT: or.b32 %r12, %r11, 1056964608;
-; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13;
-; CHECK-NEXT: abs.f32 %r15, %r1;
-; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1;
-; CHECK-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_round(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .pred %p<5>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<19>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0];
+; CHECK-NOF32X2-NEXT: and.b32 %r3, %r2, -2147483648;
+; CHECK-NOF32X2-NEXT: or.b32 %r4, %r3, 1056964608;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NOF32X2-NEXT: abs.f32 %r7, %r2;
+; CHECK-NOF32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000;
+; CHECK-NOF32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2;
+; CHECK-NOF32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000;
+; CHECK-NOF32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2;
+; CHECK-NOF32X2-NEXT: and.b32 %r11, %r1, -2147483648;
+; CHECK-NOF32X2-NEXT: or.b32 %r12, %r11, 1056964608;
+; CHECK-NOF32X2-NEXT: add.rn.f32 %r13, %r1, %r12;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13;
+; CHECK-NOF32X2-NEXT: abs.f32 %r15, %r1;
+; CHECK-NOF32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000;
+; CHECK-NOF32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3;
+; CHECK-NOF32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1;
+; CHECK-NOF32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000;
+; CHECK-NOF32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_round(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .pred %p<5>;
+; CHECK-F32X2-NEXT: .reg .b32 %r<19>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_round_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: and.b32 %r3, %r2, -2147483648;
+; CHECK-F32X2-NEXT: or.b32 %r4, %r3, 1056964608;
+; CHECK-F32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-F32X2-NEXT: abs.f32 %r7, %r2;
+; CHECK-F32X2-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000;
+; CHECK-F32X2-NEXT: selp.f32 %r8, %r2, %r6, %p1;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r9, %r2;
+; CHECK-F32X2-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000;
+; CHECK-F32X2-NEXT: selp.f32 %r10, %r9, %r8, %p2;
+; CHECK-F32X2-NEXT: and.b32 %r11, %r1, -2147483648;
+; CHECK-F32X2-NEXT: or.b32 %r12, %r11, 1056964608;
+; CHECK-F32X2-NEXT: add.rn.f32 %r13, %r1, %r12;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r14, %r13;
+; CHECK-F32X2-NEXT: abs.f32 %r15, %r1;
+; CHECK-F32X2-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000;
+; CHECK-F32X2-NEXT: selp.f32 %r16, %r1, %r14, %p3;
+; CHECK-F32X2-NEXT: cvt.rzi.f32.f32 %r17, %r1;
+; CHECK-F32X2-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000;
+; CHECK-F32X2-NEXT: selp.f32 %r18, %r17, %r16, %p4;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10};
+; CHECK-F32X2-NEXT: ret;
%r = call <2 x float> @llvm.round(<2 x float> %a)
ret <2 x float> %r
}
@@ -2017,17 +2775,13 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-NOF32X2-LABEL: test_fmuladd(
; CHECK-NOF32X2: {
; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0];
-; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
-; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1;
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1];
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0];
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6;
+; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5;
; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
; CHECK-NOF32X2-NEXT: ret;
;
@@ -2047,16 +2801,25 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
}
define <2 x float> @test_shufflevector(<2 x float> %a) #0 {
-; CHECK-LABEL: test_shufflevector(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_shufflevector(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_shufflevector(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-F32X2-NEXT: ret;
%s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0>
ret <2 x float> %s
}
@@ -2064,14 +2827,12 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 {
define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 {
; CHECK-NOF32X2-LABEL: test_insertelement(
; CHECK-NOF32X2: {
-; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
-; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>;
; CHECK-NOF32X2-EMPTY:
; CHECK-NOF32X2-NEXT: // %bb.0:
-; CHECK-NOF32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1];
-; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0];
-; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; }
-; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_insertelement_param_0];
+; CHECK-NOF32X2-NEXT: ld.param.b32 %r3, [test_insertelement_param_1];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r3};
; CHECK-NOF32X2-NEXT: ret;
;
; CHECK-F32X2-LABEL: test_insertelement(
@@ -2120,36 +2881,60 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 {
}
define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) {
-; CHECK-LABEL: test_trunc_to_v2bf16(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1;
-; CHECK-NEXT: st.b32 [%rd2], %r3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_trunc_to_v2bf16(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_1];
+; CHECK-NOF32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_trunc_to_v2bf16(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3;
+; CHECK-F32X2-NEXT: ret;
%trunc = fptrunc <2 x float> %a to <2 x bfloat>
store <2 x bfloat> %trunc, ptr %p
ret void
}
define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) {
-; CHECK-LABEL: test_trunc_to_v2f16(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1;
-; CHECK-NEXT: st.b32 [%rd2], %r3;
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_trunc_to_v2f16(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0];
+; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_1];
+; CHECK-NOF32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1;
+; CHECK-NOF32X2-NEXT: st.b32 [%rd1], %r3;
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_trunc_to_v2f16(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1;
+; CHECK-F32X2-NEXT: st.b32 [%rd2], %r3;
+; CHECK-F32X2-NEXT: ret;
%trunc = fptrunc <2 x float> %a to <2 x half>
store <2 x half> %trunc, ptr %p
ret void
diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index 3ac8f65ff858b..cb1d12661ed64 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -7,16 +7,17 @@ declare <4 x float> @bar()
define void @foo(ptr %ptr) {
; CHECK-LABEL: foo(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
; CHECK-NEXT: call.uni (retval0), bar, ();
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3};
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%val = tail call <4 x float> @bar()
store <4 x float> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
index 68c53cde7f9ac..a846607d816c5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
@@ -137,18 +137,32 @@ define void @generic_4xi64(ptr %a, ptr %b) {
}
define void @generic_8xfloat(ptr %a, ptr %b) {
-; CHECK-LABEL: generic_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1];
-; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: generic_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
+; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1];
+; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: generic_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
+; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1];
+; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr %a
store <8 x float> %a.load, ptr %b
ret void
@@ -288,18 +302,32 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) {
}
define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
-; CHECK-LABEL: generic_volatile_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
-; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
-; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: generic_volatile_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
+; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1];
+; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: generic_volatile_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
+; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
+; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr %a
store volatile <8 x float> %a.load, ptr %b
ret void
@@ -514,15 +542,16 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_8xfloat(
; SM90: {
-; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
-; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
-; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1];
-; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
-; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
+; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1];
+; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_8xfloat(
@@ -758,15 +787,16 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_8xfloat(
; SM90: {
-; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
-; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
-; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1];
-; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
-; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
+; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1];
+; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_8xfloat(
@@ -931,18 +961,32 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
}
define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
-; CHECK-LABEL: shared_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
-; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1];
-; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: shared_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
+; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1];
+; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: shared_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
+; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1];
+; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr addrspace(3) %a
store <8 x float> %a.load, ptr addrspace(3) %b
ret void
@@ -1082,18 +1126,32 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
}
define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
-; CHECK-LABEL: shared_volatile_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
-; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
-; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: shared_volatile_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
+; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1];
+; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: shared_volatile_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
+; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
+; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(3) %a
store volatile <8 x float> %a.load, ptr addrspace(3) %b
ret void
@@ -1235,18 +1293,32 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
}
define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
-; CHECK-LABEL: local_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr addrspace(5) %a
store <8 x float> %a.load, ptr addrspace(5) %b
ret void
@@ -1386,18 +1458,32 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
}
define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
-; CHECK-LABEL: local_volatile_8xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_volatile_8xfloat(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_volatile_8xfloat(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(5) %a
store volatile <8 x float> %a.load, ptr addrspace(5) %b
ret void
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index d542fa58684a1..7553c727b09c5 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -333,28 +333,30 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo11(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
-; SM20-NEXT: st.global.b64 [%rd4], %rd5;
+; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2];
+; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo11(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<6>;
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
-; SM35-NEXT: st.global.b64 [%rd4], %rd5;
+; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
+; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM35-NEXT: ret;
%1 = load <2 x float>, ptr %from
store <2 x float> %1, ptr %to
@@ -494,28 +496,30 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo16(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<7>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
-; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
+; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
+; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo16(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<7>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
-; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
+; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
+; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM35-NEXT: ret;
%1 = load <4 x float>, ptr %from
store <4 x float> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index dfdb33852305b..0039370e6dcf5 100644
--- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -8,55 +8,52 @@ target triple = "nvptx64-nvidia-cuda"
define <4 x float> @t1(ptr %p1) {
; CHECK-LABEL: t1(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<46>;
+; CHECK-NEXT: .reg .b32 %r<41>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0];
-; CHECK-NEXT: ld.b8 %rd2, [%rd1+8];
-; CHECK-NEXT: ld.b8 %rd3, [%rd1+9];
-; CHECK-NEXT: shl.b64 %rd4, %rd3, 8;
-; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
-; CHECK-NEXT: ld.b8 %rd6, [%rd1+10];
-; CHECK-NEXT: shl.b64 %rd7, %rd6, 16;
-; CHECK-NEXT: ld.b8 %rd8, [%rd1+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 24;
-; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7;
-; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5;
-; CHECK-NEXT: ld.b8 %rd12, [%rd1+12];
-; CHECK-NEXT: ld.b8 %rd13, [%rd1+13];
-; CHECK-NEXT: shl.b64 %rd14, %rd13, 8;
-; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12;
-; CHECK-NEXT: ld.b8 %rd16, [%rd1+14];
-; CHECK-NEXT: shl.b64 %rd17, %rd16, 16;
-; CHECK-NEXT: ld.b8 %rd18, [%rd1+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 24;
-; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17;
-; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15;
-; CHECK-NEXT: shl.b64 %rd22, %rd21, 32;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11;
-; CHECK-NEXT: ld.b8 %rd24, [%rd1];
-; CHECK-NEXT: ld.b8 %rd25, [%rd1+1];
-; CHECK-NEXT: shl.b64 %rd26, %rd25, 8;
-; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24;
-; CHECK-NEXT: ld.b8 %rd28, [%rd1+2];
-; CHECK-NEXT: shl.b64 %rd29, %rd28, 16;
-; CHECK-NEXT: ld.b8 %rd30, [%rd1+3];
-; CHECK-NEXT: shl.b64 %rd31, %rd30, 24;
-; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29;
-; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27;
-; CHECK-NEXT: ld.b8 %rd34, [%rd1+4];
-; CHECK-NEXT: ld.b8 %rd35, [%rd1+5];
-; CHECK-NEXT: shl.b64 %rd36, %rd35, 8;
-; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34;
-; CHECK-NEXT: ld.b8 %rd38, [%rd1+6];
-; CHECK-NEXT: shl.b64 %rd39, %rd38, 16;
-; CHECK-NEXT: ld.b8 %rd40, [%rd1+7];
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 24;
-; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39;
-; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37;
-; CHECK-NEXT: shl.b64 %rd44, %rd43, 32;
-; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23};
+; CHECK-NEXT: ld.b8 %r1, [%rd1+12];
+; CHECK-NEXT: ld.b8 %r2, [%rd1+13];
+; CHECK-NEXT: shl.b32 %r3, %r2, 8;
+; CHECK-NEXT: or.b32 %r4, %r3, %r1;
+; CHECK-NEXT: ld.b8 %r5, [%rd1+14];
+; CHECK-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-NEXT: ld.b8 %r7, [%rd1+15];
+; CHECK-NEXT: shl.b32 %r8, %r7, 24;
+; CHECK-NEXT: or.b32 %r9, %r8, %r6;
+; CHECK-NEXT: or.b32 %r10, %r9, %r4;
+; CHECK-NEXT: ld.b8 %r11, [%rd1+8];
+; CHECK-NEXT: ld.b8 %r12, [%rd1+9];
+; CHECK-NEXT: shl.b32 %r13, %r12, 8;
+; CHECK-NEXT: or.b32 %r14, %r13, %r11;
+; CHECK-NEXT: ld.b8 %r15, [%rd1+10];
+; CHECK-NEXT: shl.b32 %r16, %r15, 16;
+; CHECK-NEXT: ld.b8 %r17, [%rd1+11];
+; CHECK-NEXT: shl.b32 %r18, %r17, 24;
+; CHECK-NEXT: or.b32 %r19, %r18, %r16;
+; CHECK-NEXT: or.b32 %r20, %r19, %r14;
+; CHECK-NEXT: ld.b8 %r21, [%rd1+4];
+; CHECK-NEXT: ld.b8 %r22, [%rd1+5];
+; CHECK-NEXT: shl.b32 %r23, %r22, 8;
+; CHECK-NEXT: or.b32 %r24, %r23, %r21;
+; CHECK-NEXT: ld.b8 %r25, [%rd1+6];
+; CHECK-NEXT: shl.b32 %r26, %r25, 16;
+; CHECK-NEXT: ld.b8 %r27, [%rd1+7];
+; CHECK-NEXT: shl.b32 %r28, %r27, 24;
+; CHECK-NEXT: or.b32 %r29, %r28, %r26;
+; CHECK-NEXT: or.b32 %r30, %r29, %r24;
+; CHECK-NEXT: ld.b8 %r31, [%rd1];
+; CHECK-NEXT: ld.b8 %r32, [%rd1+1];
+; CHECK-NEXT: shl.b32 %r33, %r32, 8;
+; CHECK-NEXT: or.b32 %r34, %r33, %r31;
+; CHECK-NEXT: ld.b8 %r35, [%rd1+2];
+; CHECK-NEXT: shl.b32 %r36, %r35, 16;
+; CHECK-NEXT: ld.b8 %r37, [%rd1+3];
+; CHECK-NEXT: shl.b32 %r38, %r37, 24;
+; CHECK-NEXT: or.b32 %r39, %r38, %r36;
+; CHECK-NEXT: or.b32 %r40, %r39, %r34;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r30, %r20, %r10};
; CHECK-NEXT: ret;
%r = load <4 x float>, ptr %p1, align 1
ret <4 x float> %r
@@ -65,19 +62,16 @@ define <4 x float> @t1(ptr %p1) {
define <4 x float> @t2(ptr %p1) {
; CHECK-LABEL: t2(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<10>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0];
-; CHECK-NEXT: ld.b32 %rd2, [%rd1+8];
-; CHECK-NEXT: ld.b32 %rd3, [%rd1+12];
-; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
-; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
-; CHECK-NEXT: ld.b32 %rd6, [%rd1];
-; CHECK-NEXT: ld.b32 %rd7, [%rd1+4];
-; CHECK-NEXT: shl.b64 %rd8, %rd7, 32;
-; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5};
+; CHECK-NEXT: ld.b32 %r1, [%rd1+12];
+; CHECK-NEXT: ld.b32 %r2, [%rd1+8];
+; CHECK-NEXT: ld.b32 %r3, [%rd1+4];
+; CHECK-NEXT: ld.b32 %r4, [%rd1];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
%r = load <4 x float>, ptr %p1, align 4
ret <4 x float> %r
@@ -86,13 +80,14 @@ define <4 x float> @t2(ptr %p1) {
define <4 x float> @t3(ptr %p1) {
; CHECK-LABEL: t3(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0];
-; CHECK-NEXT: ld.b64 %rd2, [%rd1+8];
-; CHECK-NEXT: ld.b64 %rd3, [%rd1];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1+8];
+; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r4, %r1, %r2};
; CHECK-NEXT: ret;
%r = load <4 x float>, ptr %p1, align 8
ret <4 x float> %r
@@ -101,12 +96,13 @@ define <4 x float> @t3(ptr %p1) {
define <4 x float> @t4(ptr %p1) {
; CHECK-LABEL: t4(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0];
-; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3};
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%r = load <4 x float>, ptr %p1, align 16
ret <4 x float> %r
@@ -189,40 +185,43 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
define void @s1(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s1(
; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1];
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r4;
+; CHECK-NEXT: st.b8 [%rd1+12], %rd2;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
; CHECK-NEXT: st.b8 [%rd1+8], %rd3;
-; CHECK-NEXT: st.b8 [%rd1], %rd2;
-; CHECK-NEXT: shr.u64 %rd4, %rd3, 56;
-; CHECK-NEXT: st.b8 [%rd1+15], %rd4;
-; CHECK-NEXT: shr.u64 %rd5, %rd3, 48;
-; CHECK-NEXT: st.b8 [%rd1+14], %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd3, 40;
-; CHECK-NEXT: st.b8 [%rd1+13], %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd3, 32;
-; CHECK-NEXT: st.b8 [%rd1+12], %rd7;
-; CHECK-NEXT: shr.u64 %rd8, %rd3, 24;
-; CHECK-NEXT: st.b8 [%rd1+11], %rd8;
-; CHECK-NEXT: shr.u64 %rd9, %rd3, 16;
-; CHECK-NEXT: st.b8 [%rd1+10], %rd9;
-; CHECK-NEXT: shr.u64 %rd10, %rd3, 8;
-; CHECK-NEXT: st.b8 [%rd1+9], %rd10;
-; CHECK-NEXT: shr.u64 %rd11, %rd2, 56;
-; CHECK-NEXT: st.b8 [%rd1+7], %rd11;
-; CHECK-NEXT: shr.u64 %rd12, %rd2, 48;
-; CHECK-NEXT: st.b8 [%rd1+6], %rd12;
-; CHECK-NEXT: shr.u64 %rd13, %rd2, 40;
-; CHECK-NEXT: st.b8 [%rd1+5], %rd13;
-; CHECK-NEXT: shr.u64 %rd14, %rd2, 32;
-; CHECK-NEXT: st.b8 [%rd1+4], %rd14;
-; CHECK-NEXT: shr.u64 %rd15, %rd2, 24;
+; CHECK-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-NEXT: st.b8 [%rd1+4], %rd4;
+; CHECK-NEXT: cvt.u64.u32 %rd5, %r1;
+; CHECK-NEXT: st.b8 [%rd1], %rd5;
+; CHECK-NEXT: shr.u64 %rd6, %rd2, 24;
+; CHECK-NEXT: st.b8 [%rd1+15], %rd6;
+; CHECK-NEXT: shr.u64 %rd7, %rd2, 16;
+; CHECK-NEXT: st.b8 [%rd1+14], %rd7;
+; CHECK-NEXT: shr.u64 %rd8, %rd2, 8;
+; CHECK-NEXT: st.b8 [%rd1+13], %rd8;
+; CHECK-NEXT: shr.u64 %rd9, %rd3, 24;
+; CHECK-NEXT: st.b8 [%rd1+11], %rd9;
+; CHECK-NEXT: shr.u64 %rd10, %rd3, 16;
+; CHECK-NEXT: st.b8 [%rd1+10], %rd10;
+; CHECK-NEXT: shr.u64 %rd11, %rd3, 8;
+; CHECK-NEXT: st.b8 [%rd1+9], %rd11;
+; CHECK-NEXT: shr.u64 %rd12, %rd4, 24;
+; CHECK-NEXT: st.b8 [%rd1+7], %rd12;
+; CHECK-NEXT: shr.u64 %rd13, %rd4, 16;
+; CHECK-NEXT: st.b8 [%rd1+6], %rd13;
+; CHECK-NEXT: shr.u64 %rd14, %rd4, 8;
+; CHECK-NEXT: st.b8 [%rd1+5], %rd14;
+; CHECK-NEXT: shr.u64 %rd15, %rd5, 24;
; CHECK-NEXT: st.b8 [%rd1+3], %rd15;
-; CHECK-NEXT: shr.u64 %rd16, %rd2, 16;
+; CHECK-NEXT: shr.u64 %rd16, %rd5, 16;
; CHECK-NEXT: st.b8 [%rd1+2], %rd16;
-; CHECK-NEXT: shr.u64 %rd17, %rd2, 8;
+; CHECK-NEXT: shr.u64 %rd17, %rd5, 8;
; CHECK-NEXT: st.b8 [%rd1+1], %rd17;
; CHECK-NEXT: ret;
store <4 x float> %v, ptr %p1, align 1
@@ -232,17 +231,16 @@ define void @s1(ptr %p1, <4 x float> %v) {
define void @s2(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s2(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1];
-; CHECK-NEXT: st.b32 [%rd1+8], %rd3;
-; CHECK-NEXT: st.b32 [%rd1], %rd2;
-; CHECK-NEXT: shr.u64 %rd4, %rd3, 32;
-; CHECK-NEXT: st.b32 [%rd1+12], %rd4;
-; CHECK-NEXT: shr.u64 %rd5, %rd2, 32;
-; CHECK-NEXT: st.b32 [%rd1+4], %rd5;
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s2_param_1];
+; CHECK-NEXT: st.b32 [%rd1+12], %r4;
+; CHECK-NEXT: st.b32 [%rd1+8], %r3;
+; CHECK-NEXT: st.b32 [%rd1+4], %r2;
+; CHECK-NEXT: st.b32 [%rd1], %r1;
; CHECK-NEXT: ret;
store <4 x float> %v, ptr %p1, align 4
ret void
@@ -251,13 +249,14 @@ define void @s2(ptr %p1, <4 x float> %v) {
define void @s3(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s3(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1];
-; CHECK-NEXT: st.b64 [%rd1+8], %rd3;
-; CHECK-NEXT: st.b64 [%rd1], %rd2;
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s3_param_1];
+; CHECK-NEXT: st.v2.b32 [%rd1+8], {%r3, %r4};
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
; CHECK-NEXT: ret;
store <4 x float> %v, ptr %p1, align 8
ret void
@@ -266,12 +265,13 @@ define void @s3(ptr %p1, <4 x float> %v) {
define void @s4(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s4(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1];
-; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s4_param_1];
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
store <4 x float> %v, ptr %p1, align 16
ret void
diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll
index 666c7a160e1f0..17220340d4b07 100644
--- a/llvm/test/CodeGen/NVPTX/mulwide.ll
+++ b/llvm/test/CodeGen/NVPTX/mulwide.ll
@@ -203,27 +203,35 @@ define i64 @mulwideu32(i32 %a, i32 %b) {
define i64 @mulwideu7(i7 %a, i7 %b) {
; OPT-LABEL: mulwideu7(
; OPT: {
-; OPT-NEXT: .reg .b32 %r<3>;
+; OPT-NEXT: .reg .b32 %r<5>;
; OPT-NEXT: .reg .b64 %rd<2>;
; OPT-EMPTY:
; OPT-NEXT: // %bb.0:
-; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_0];
-; OPT-NEXT: ld.param.b8 %r2, [mulwideu7_param_1];
-; OPT-NEXT: mul.wide.u32 %rd1, %r1, %r2;
+; OPT-NEXT: ld.param.b8 %r1, [mulwideu7_param_1];
+; OPT-NEXT: and.b32 %r2, %r1, 127;
+; OPT-NEXT: ld.param.b8 %r3, [mulwideu7_param_0];
+; OPT-NEXT: and.b32 %r4, %r3, 127;
+; OPT-NEXT: mul.wide.u32 %rd1, %r4, %r2;
; OPT-NEXT: st.param.b64 [func_retval0], %rd1;
; OPT-NEXT: ret;
;
; NOOPT-LABEL: mulwideu7(
; NOOPT: {
-; NOOPT-NEXT: .reg .b16 %rs<3>;
+; NOOPT-NEXT: .reg .b16 %rs<9>;
; NOOPT-NEXT: .reg .b64 %rd<6>;
; NOOPT-EMPTY:
; NOOPT-NEXT: // %bb.0:
-; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu7_param_1];
-; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu7_param_0];
-; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1;
+; NOOPT-NEXT: ld.param.b8 %rs3, [mulwideu7_param_0+1];
+; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8;
+; NOOPT-NEXT: ld.param.b8 %rs5, [mulwideu7_param_0];
+; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5;
+; NOOPT-NEXT: ld.param.b8 %rs6, [mulwideu7_param_1+1];
+; NOOPT-NEXT: shl.b16 %rs7, %rs6, 8;
+; NOOPT-NEXT: ld.param.b8 %rs8, [mulwideu7_param_1];
+; NOOPT-NEXT: or.b16 %rs2, %rs7, %rs8;
+; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs5;
; NOOPT-NEXT: and.b64 %rd2, %rd1, 127;
-; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2;
+; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs8;
; NOOPT-NEXT: and.b64 %rd4, %rd3, 127;
; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4;
; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5;
@@ -242,26 +250,32 @@ define i64 @mulwides7(i7 %a, i7 %b) {
; OPT-EMPTY:
; OPT-NEXT: // %bb.0:
; OPT-NEXT: ld.param.b8 %r1, [mulwides7_param_0];
-; OPT-NEXT: bfe.s32 %r2, %r1, 0, 7;
-; OPT-NEXT: ld.param.b8 %r3, [mulwides7_param_1];
-; OPT-NEXT: bfe.s32 %r4, %r3, 0, 7;
-; OPT-NEXT: mul.wide.s32 %rd1, %r2, %r4;
+; OPT-NEXT: ld.param.b8 %r2, [mulwides7_param_1];
+; OPT-NEXT: bfe.s32 %r3, %r2, 0, 7;
+; OPT-NEXT: bfe.s32 %r4, %r1, 0, 7;
+; OPT-NEXT: mul.wide.s32 %rd1, %r4, %r3;
; OPT-NEXT: st.param.b64 [func_retval0], %rd1;
; OPT-NEXT: ret;
;
; NOOPT-LABEL: mulwides7(
; NOOPT: {
-; NOOPT-NEXT: .reg .b16 %rs<3>;
+; NOOPT-NEXT: .reg .b16 %rs<9>;
; NOOPT-NEXT: .reg .b64 %rd<6>;
; NOOPT-EMPTY:
; NOOPT-NEXT: // %bb.0:
-; NOOPT-NEXT: ld.param.b8 %rs2, [mulwides7_param_1];
-; NOOPT-NEXT: ld.param.b8 %rs1, [mulwides7_param_0];
-; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs1;
-; NOOPT-NEXT: bfe.s64 %rd2, %rd1, 0, 7;
-; NOOPT-NEXT: cvt.u64.u16 %rd3, %rs2;
-; NOOPT-NEXT: bfe.s64 %rd4, %rd3, 0, 7;
-; NOOPT-NEXT: mul.lo.s64 %rd5, %rd2, %rd4;
+; NOOPT-NEXT: ld.param.b8 %rs3, [mulwides7_param_0+1];
+; NOOPT-NEXT: shl.b16 %rs4, %rs3, 8;
+; NOOPT-NEXT: ld.param.b8 %rs5, [mulwides7_param_0];
+; NOOPT-NEXT: or.b16 %rs1, %rs4, %rs5;
+; NOOPT-NEXT: ld.param.b8 %rs6, [mulwides7_param_1];
+; NOOPT-NEXT: cvt.u64.u16 %rd1, %rs6;
+; NOOPT-NEXT: cvt.u64.u16 %rd2, %rs5;
+; NOOPT-NEXT: ld.param.b8 %rs7, [mulwides7_param_1+1];
+; NOOPT-NEXT: shl.b16 %rs8, %rs7, 8;
+; NOOPT-NEXT: or.b16 %rs2, %rs8, %rs6;
+; NOOPT-NEXT: bfe.s64 %rd3, %rd2, 0, 7;
+; NOOPT-NEXT: bfe.s64 %rd4, %rd1, 0, 7;
+; NOOPT-NEXT: mul.lo.s64 %rd5, %rd3, %rd4;
; NOOPT-NEXT: st.param.b64 [func_retval0], %rd5;
; NOOPT-NEXT: ret;
%val0 = sext i7 %a to i64
diff --git a/llvm/test/CodeGen/NVPTX/no-f32x2.ll b/llvm/test/CodeGen/NVPTX/no-f32x2.ll
new file mode 100644
index 0000000000000..b2b909166a0c6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/no-f32x2.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_100 | FileCheck %s --check-prefix=F32X2
+; RUN: llc < %s -mcpu=sm_90 | FileCheck %s --check-prefix=NOF32X2
+; RUN: llc < %s -mcpu=sm_100 -nvptx-no-f32x2 | FileCheck %s --check-prefix=NOF32X2
+
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @test(<2 x float> %a, <2 x float> %b) {
+; F32X2-LABEL: test(
+; F32X2: {
+; F32X2-NEXT: .reg .b64 %rd<4>;
+; F32X2-EMPTY:
+; F32X2-NEXT: // %bb.0:
+; F32X2-NEXT: ld.param.b64 %rd1, [test_param_0];
+; F32X2-NEXT: ld.param.b64 %rd2, [test_param_1];
+; F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; F32X2-NEXT: ret;
+;
+; NOF32X2-LABEL: test(
+; NOF32X2: {
+; NOF32X2-NEXT: .reg .b32 %r<7>;
+; NOF32X2-EMPTY:
+; NOF32X2-NEXT: // %bb.0:
+; NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_param_0];
+; NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_param_1];
+; NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4;
+; NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3;
+; NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; NOF32X2-NEXT: ret;
+ %c = fadd <2 x float> %a, %b
+ ret <2 x float> %c
+}
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index db3fbbc1d2c0f..90c8b921009b6 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -523,8 +523,7 @@ define <9 x half> @test_v9f16(<9 x half> %a) {
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i19(
; CHECK-NEXT: .param .b32 test_i19_param_0
-; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0];
-; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2];
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i19_param_0];
; CHECK: .param .b32 param0;
; CHECK: .param .b32 retval0;
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
@@ -540,8 +539,7 @@ define i19 @test_i19(i19 %a) {
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i23(
; CHECK-NEXT: .param .b32 test_i23_param_0
-; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0];
-; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2];
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i23_param_0];
; CHECK: .param .b32 param0;
; CHECK: .param .b32 retval0;
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
@@ -557,8 +555,7 @@ define i23 @test_i23(i23 %a) {
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i24(
; CHECK-NEXT: .param .b32 test_i24_param_0
-; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2];
-; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0];
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i24_param_0];
; CHECK: .param .b32 param0;
; CHECK: .param .b32 retval0;
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
@@ -678,8 +675,7 @@ define float @test_f32(float %a) {
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i40(
; CHECK-NEXT: .param .b64 test_i40_param_0
-; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4];
-; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0];
+; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i40_param_0];
; CHECK: .param .b64 param0;
; CHECK: .param .b64 retval0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
@@ -695,8 +691,7 @@ define i40 @test_i40(i40 %a) {
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i47(
; CHECK-NEXT: .param .b64 test_i47_param_0
-; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4];
-; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0];
+; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i47_param_0];
; CHECK: .param .b64 param0;
; CHECK: .param .b64 retval0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
@@ -712,8 +707,7 @@ define i47 @test_i47(i47 %a) {
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i48(
; CHECK-NEXT: .param .b64 test_i48_param_0
-; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4];
-; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0];
+; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i48_param_0];
; CHECK: .param .b64 param0;
; CHECK: .param .b64 retval0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
@@ -729,9 +723,7 @@ define i48 @test_i48(i48 %a) {
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i51(
; CHECK-NEXT: .param .b64 test_i51_param_0
-; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6];
-; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4];
-; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0];
+; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i51_param_0];
; CHECK: .param .b64 param0;
; CHECK: .param .b64 retval0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
@@ -747,9 +739,7 @@ define i51 @test_i51(i51 %a) {
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i56(
; CHECK-NEXT: .param .b64 test_i56_param_0
-; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6];
-; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4];
-; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0];
+; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i56_param_0];
; CHECK: .param .b64 param0;
; CHECK: .param .b64 retval0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll
index 95258f7a3f360..f56b8eb98077c 100644
--- a/llvm/test/CodeGen/NVPTX/pr126337.ll
+++ b/llvm/test/CodeGen/NVPTX/pr126337.ll
@@ -17,17 +17,16 @@ define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %.preheader15
-; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0];
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0];
; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1;
; CHECK-NEXT: $L__BB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, 0;
-; CHECK-NEXT: st.b8 [%rd2], %rs1;
+; CHECK-NEXT: mov.b64 %rd1, 0;
+; CHECK-NEXT: st.b8 [%rd1], %rs1;
; CHECK-NEXT: bra.uni $L__BB0_1;
.preheader15:
br label %1
diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
index c78fcddb7ed0f..153d677058d9f 100644
--- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
+++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -25,11 +25,11 @@ define float @test_gv_float() {
define <2 x float> @test_gv_float2() {
; CHECK-LABEL: test_gv_float2(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [gv_float2];
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
; CHECK-NEXT: ret;
%v = load <2 x float>, ptr @gv_float2
ret <2 x float> %v
@@ -38,11 +38,11 @@ define <2 x float> @test_gv_float2() {
define <4 x float> @test_gv_float4() {
; CHECK-LABEL: test_gv_float4(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [gv_float4];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%v = load <4 x float>, ptr @gv_float4
ret <4 x float> %v
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 94c2637ea7509..f286928da4481 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -86,28 +86,46 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
}
define float @reduce_fadd_float(<8 x float> %in) {
-; CHECK-LABEL: reduce_fadd_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<17>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3;
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0];
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000;
-; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8;
-; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5;
-; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6;
-; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3;
-; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4;
-; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1;
-; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r16;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_fadd_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<17>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0];
+; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, 0f00000000;
+; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r2;
+; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r3;
+; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4;
+; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r5;
+; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r6;
+; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r7;
+; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fadd_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<17>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3;
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-SM100-NEXT: add.rn.f32 %r9, %r7, 0f00000000;
+; CHECK-SM100-NEXT: add.rn.f32 %r10, %r9, %r8;
+; CHECK-SM100-NEXT: add.rn.f32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, %r6;
+; CHECK-SM100-NEXT: add.rn.f32 %r13, %r12, %r3;
+; CHECK-SM100-NEXT: add.rn.f32 %r14, %r13, %r4;
+; CHECK-SM100-NEXT: add.rn.f32 %r15, %r14, %r1;
+; CHECK-SM100-NEXT: add.rn.f32 %r16, %r15, %r2;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r16;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
ret float %res
}
@@ -116,20 +134,15 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fadd_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<17>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: add.rn.f32 %r5, %r4, %r2;
-; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1;
-; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r7;
-; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r5;
-; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r1;
-; CHECK-SM80-NEXT: add.rn.f32 %r13, %r8, %r6;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-SM80-NEXT: add.rn.f32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: add.rn.f32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: add.rn.f32 %r13, %r1, %r5;
; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r12;
; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000;
@@ -272,27 +285,44 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
}
define float @reduce_fmul_float(<8 x float> %in) {
-; CHECK-LABEL: reduce_fmul_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16];
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3;
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0];
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8;
-; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5;
-; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6;
-; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3;
-; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4;
-; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1;
-; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_fmul_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0];
+; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r2;
+; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r3;
+; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r4;
+; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r5;
+; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r6;
+; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r7;
+; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmul_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3;
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-SM100-NEXT: mul.rn.f32 %r9, %r7, %r8;
+; CHECK-SM100-NEXT: mul.rn.f32 %r10, %r9, %r5;
+; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r10, %r6;
+; CHECK-SM100-NEXT: mul.rn.f32 %r12, %r11, %r3;
+; CHECK-SM100-NEXT: mul.rn.f32 %r13, %r12, %r4;
+; CHECK-SM100-NEXT: mul.rn.f32 %r14, %r13, %r1;
+; CHECK-SM100-NEXT: mul.rn.f32 %r15, %r14, %r2;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
ret float %res
}
@@ -301,20 +331,15 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmul_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r4, %r2;
-; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1;
-; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r7;
-; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r5;
-; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r1;
-; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r8, %r6;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r1, %r5;
; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r12;
; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
@@ -495,15 +520,10 @@ define float @reduce_fmax_float(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmax_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
@@ -540,15 +560,10 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmax_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
@@ -620,15 +635,10 @@ define float @reduce_fmax_float_nnan(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmax_float_nnan(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_nnan_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_nnan_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_nnan_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_param_0];
; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
@@ -809,15 +819,10 @@ define float @reduce_fmin_float(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmin_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
@@ -854,15 +859,10 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmin_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
@@ -934,15 +934,10 @@ define float @reduce_fmin_float_nnan(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmin_float_nnan(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_nnan_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_nnan_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_nnan_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_param_0];
; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
@@ -1078,15 +1073,10 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmaximum_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
@@ -1123,15 +1113,10 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
@@ -1267,15 +1252,10 @@ define float @reduce_fminimum_float(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fminimum_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
@@ -1312,15 +1292,10 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b32 %r<16>;
-; CHECK-SM80-NEXT: .reg .b64 %rd<5>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2;
-; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT: mov.b64 {%r5, %r6}, %rd3;
-; CHECK-SM80-NEXT: mov.b64 {%r7, %r8}, %rd4;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index 29939e323b4b1..3c424c9318375 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -7,17 +7,17 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define <16 x float> @test_v16f32(<16 x float> %a) {
; CHECK-LABEL: test_v16f32(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b32 %r<17>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16];
-; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32];
-; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16f32_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16f32_param_0+16];
+; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [test_v16f32_param_0+32];
+; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [test_v16f32_param_0+48];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+48], {%r13, %r14, %r15, %r16};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+32], {%r9, %r10, %r11, %r12};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
ret <16 x float> %a
}
@@ -25,13 +25,13 @@ define <16 x float> @test_v16f32(<16 x float> %a) {
define <8 x float> @test_v8f32(<8 x float> %a) {
; CHECK-LABEL: test_v8f32(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v8f32_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v8f32_param_0+16];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
ret <8 x float> %a
}
@@ -39,11 +39,11 @@ define <8 x float> @test_v8f32(<8 x float> %a) {
define <4 x float> @test_v4f32(<4 x float> %a) {
; CHECK-LABEL: test_v4f32(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v4f32_param_0];
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
ret <4 x float> %a
}
@@ -51,11 +51,11 @@ define <4 x float> @test_v4f32(<4 x float> %a) {
define <2 x float> @test_v2f32(<2 x float> %a) {
; CHECK-LABEL: test_v2f32(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0];
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
; CHECK-NEXT: ret;
ret <2 x float> %a
}
@@ -64,14 +64,13 @@ define <2 x float> @test_v2f32(<2 x float> %a) {
define <3 x float> @test_v3f32(<3 x float> %a) {
; CHECK-LABEL: test_v3f32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
-; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
+; CHECK-NEXT: st.param.b32 [func_retval0+8], %r3;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
; CHECK-NEXT: ret;
ret <3 x float> %a
}
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 6f0dff78d5569..ccac7ff8e6472 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -206,18 +206,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst
; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1];
; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
-; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
-; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
-; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
+; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
+; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
+; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
+; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT: ret;
@@ -270,18 +270,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia
; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: cvt.f32.f16 %r5, %rs2;
-; CHECK-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: cvt.f32.f16 %r7, %rs4;
-; CHECK-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4;
-; CHECK-NEXT: cvt.f32.f16 %r9, %rs6;
-; CHECK-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT: cvt.f32.f16 %r11, %rs8;
-; CHECK-NEXT: cvt.f32.f16 %r12, %rs7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; CHECK-NEXT: cvt.f32.f16 %r5, %rs8;
+; CHECK-NEXT: cvt.f32.f16 %r6, %rs7;
+; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
+; CHECK-NEXT: cvt.f32.f16 %r9, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %r10, %rs3;
+; CHECK-NEXT: cvt.f32.f16 %r11, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %r12, %rs1;
; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9};
; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index d07c740d32a72..b9bb417aa2c37 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -5,12 +5,13 @@
define void @foo1(<2 x float> %val, ptr %ptr) {
; CHECK-LABEL: foo1(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1];
-; CHECK-NEXT: st.b64 [%rd2], %rd1;
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_1];
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
; CHECK-NEXT: ret;
store <2 x float> %val, ptr %ptr
ret void
@@ -19,12 +20,13 @@ define void @foo1(<2 x float> %val, ptr %ptr) {
define void @foo2(<4 x float> %val, ptr %ptr) {
; CHECK-LABEL: foo2(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0];
-; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1];
-; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2};
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [foo2_param_1];
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
store <4 x float> %val, ptr %ptr
ret void