[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 21:04:42 PST 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/126337
>From 6b8fc8d4dc268c8324fb96a5a75a3f8c1d0c3982 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 30 Jan 2025 17:36:09 -0800
Subject: [PATCH 01/22] support f32x2 instructions for Blackwell
This is a rewrite of previous work that legalized v2f32 into an i64
register. Here we keep the type non-legal, and selectively legalize it
for certain operations (FADD, FSUB, FMUL, FMA). Additional operations
are handled to improve codegen quality.
---
.../include/llvm/Target/TargetSelectionDAG.td | 4 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 136 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 7 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 12 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 22 +
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 4 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 3 +
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 1111 +++++++++++++++++
10 files changed, 1308 insertions(+), 22 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 42a5fbec95174e1..1e432f55ad4b6cc 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -818,6 +818,10 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
[]>;
+def build_pair : SDNode<"ISD::BUILD_PAIR", SDTypeProfile<1, 2,
+ [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>, []>;
+
+
// vector_extract/vector_insert are deprecated. extractelt/insertelt
// are preferred.
def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ec654e0f3f200f4..991afa0d5ec9a81 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -121,6 +121,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::SETP_BF16X2:
SelectSETP_BF16X2(N);
return;
+ case NVPTXISD::FADD_F32X2:
+ case NVPTXISD::FSUB_F32X2:
+ case NVPTXISD::FMUL_F32X2:
+ case NVPTXISD::FMA_F32X2:
+ SelectF32X2Op(N);
+ return;
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
if (tryLoadVector(N))
@@ -295,6 +301,30 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
return true;
}
+// Select a packed-f32 NVPTXISD node (FADD/FSUB/FMUL/FMA _F32X2) by replacing
+// it with the identically named NVPTX machine instruction. The node's
+// operands are forwarded unchanged and the machine node yields one i64 value.
+void NVPTXDAGToDAGISel::SelectF32X2Op(SDNode *N) {
+  // Translate the target-specific DAG opcode into the machine opcode.
+  const auto MachineOpcodeFor = [](unsigned DAGOpcode) -> unsigned {
+    switch (DAGOpcode) {
+    case NVPTXISD::FADD_F32X2:
+      return NVPTX::FADD_F32X2;
+    case NVPTXISD::FSUB_F32X2:
+      return NVPTX::FSUB_F32X2;
+    case NVPTXISD::FMUL_F32X2:
+      return NVPTX::FMUL_F32X2;
+    case NVPTXISD::FMA_F32X2:
+      return NVPTX::FMA_F32X2;
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    }
+  };
+  const unsigned MachineOpcode = MachineOpcodeFor(N->getOpcode());
+
+  SDLoc Loc(N);
+  SmallVector<SDValue> Operands(N->ops());
+  SDNode *Selected =
+      CurDAG->getMachineNode(MachineOpcode, Loc, MVT::i64, Operands);
+  ReplaceNode(N, Selected);
+}
+
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 8dc6bc86c68281f..db23bcc096a6afb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -88,6 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
+ void SelectF32X2Op(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 58ad92a8934a66d..306ab952b2af757 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -866,6 +866,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
// (would be) Library functions.
+ if (STI.hasF32x2Instructions()) {
+ // Handle custom lowering for: v2f32 = OP v2f32, v2f32
+ for (const auto &Op : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA})
+ setOperationAction(Op, MVT::v2f32, Custom);
+ // Handle custom lowering for: i64 = bitcast v2f32
+ setOperationAction(ISD::BITCAST, MVT::v2f32, Custom);
+ }
+
// These map to conversion instructions for scalar FP types.
for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
ISD::FROUNDEVEN, ISD::FTRUNC}) {
@@ -1066,6 +1074,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::STACKSAVE)
MAKE_CASE(NVPTXISD::SETP_F16X2)
MAKE_CASE(NVPTXISD::SETP_BF16X2)
+ MAKE_CASE(NVPTXISD::FADD_F32X2)
+ MAKE_CASE(NVPTXISD::FSUB_F32X2)
+ MAKE_CASE(NVPTXISD::FMUL_F32X2)
+ MAKE_CASE(NVPTXISD::FMA_F32X2)
MAKE_CASE(NVPTXISD::Dummy)
MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
@@ -2099,24 +2111,58 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
EVT FromVT = Op->getOperand(0)->getValueType(0);
- if (FromVT != MVT::v2i8) {
- return Op;
- }
-
- // Pack vector elements into i16 and bitcast to final type
- SDLoc DL(Op);
- SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
- SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
- SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
- SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
- SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
- SDValue AsInt = DAG.getNode(
- ISD::OR, DL, MVT::i16,
- {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
EVT ToVT = Op->getValueType(0);
- return MaybeBitcast(DAG, DL, ToVT, AsInt);
+ SDLoc DL(Op);
+
+ if (FromVT == MVT::v2i8) {
+ // Pack vector elements into i16 and bitcast to final type
+ SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
+ SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
+ SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
+ SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
+ SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
+ SDValue AsInt = DAG.getNode(
+ ISD::OR, DL, MVT::i16,
+ {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
+ EVT ToVT = Op->getValueType(0);
+ return MaybeBitcast(DAG, DL, ToVT, AsInt);
+ }
+
+ if (FromVT == MVT::v2f32) {
+ assert(ToVT == MVT::i64);
+
+ // A bitcast to i64 from v2f32.
+ // See if we can legalize the operand.
+ const SDValue &Operand = Op->getOperand(0);
+ if (Operand.getOpcode() == ISD::BUILD_VECTOR) {
+ const SDValue &BVOp0 = Operand.getOperand(0);
+ const SDValue &BVOp1 = Operand.getOperand(1);
+
+ auto CastToAPInt = [](SDValue Op) -> APInt {
+ if (Op->isUndef())
+ return APInt(64, 0); // undef values default to 0
+ return cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt().zext(
+ 64);
+ };
+
+ if ((BVOp0->isUndef() || isa<ConstantFPSDNode>(BVOp0)) &&
+ (BVOp1->isUndef() || isa<ConstantFPSDNode>(BVOp1))) {
+ // cast two constants
+ APInt Value(64, 0);
+ Value = CastToAPInt(BVOp0) | CastToAPInt(BVOp1).shl(32);
+ SDValue Const = DAG.getConstant(Value, DL, MVT::i64);
+ return DAG.getBitcast(ToVT, Const);
+ }
+
+ // otherwise build an i64
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+ DAG.getBitcast(MVT::i32, BVOp0),
+ DAG.getBitcast(MVT::i32, BVOp1));
+ }
+ }
+ return Op;
}
// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
@@ -3055,6 +3101,13 @@ bool NVPTXTargetLowering::splitValueIntoRegisterParts(
return false;
}
+// Pick the register class used for values of type VT. v2f32 is placed in the
+// 64-bit integer register class; every other type defers to the generic
+// TargetLowering implementation.
+const TargetRegisterClass *
+NVPTXTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  if (VT != MVT::v2f32)
+    return TargetLowering::getRegClassFor(VT, isDivergent);
+  return &NVPTX::Int64RegsRegClass;
+}
+
// This creates target external symbol for a function parameter.
// Name of the symbol is composed from its index and the function name.
// Negative index corresponds to special parameter (unsized array) used for
@@ -5055,10 +5108,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
// extract_vector_elt.
- // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
+ // Don't mess with singletons or v2*16, v4i8, v8i8, or v2f32 types, we already
// handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
- VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
+ VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8 || VectorVT == MVT::v2f32)
return SDValue();
// Don't mess with undef values as sra may be simplified to 0, not undef.
@@ -5478,6 +5531,45 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
Results.push_back(NewValue.getValue(3));
}
+// Custom result legalization for a v2f32 FADD/FSUB/FMUL/FMA node. Every
+// operand is bitcast to i64, the matching NVPTXISD::*_F32X2 node is built on
+// those i64 values, and its i64 result is bitcast back to the original vector
+// type and appended to Results. UseFTZ is accepted but not consulted yet
+// (TODO).
+static void ReplaceF32x2Op(SDNode *N, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &Results,
+                           bool UseFTZ) {
+  SDLoc Loc(N);
+  EVT VecVT = N->getValueType(0); // <2 x float>
+  assert(VecVT == MVT::v2f32 && "Unexpected result type for F32x2 op!");
+
+  // Map the generic ISD opcode onto its packed NVPTX counterpart.
+  const auto PackedOpcodeFor = [](unsigned GenericOpcode) -> unsigned {
+    switch (GenericOpcode) {
+    case ISD::FADD:
+      return NVPTXISD::FADD_F32X2;
+    case ISD::FSUB:
+      return NVPTXISD::FSUB_F32X2;
+    case ISD::FMUL:
+      return NVPTXISD::FMUL_F32X2;
+    case ISD::FMA:
+      return NVPTXISD::FMA_F32X2;
+    default:
+      llvm_unreachable("Unexpected opcode");
+    }
+  };
+  const unsigned PackedOpcode = PackedOpcodeFor(N->getOpcode());
+
+  // Reinterpret each <2 x float> operand as an i64, in operand order.
+  SmallVector<SDValue> PackedOperands;
+  for (unsigned Idx = 0, NumOps = N->getNumOperands(); Idx != NumOps; ++Idx)
+    PackedOperands.push_back(
+        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, N->getOperand(Idx)));
+
+  // Build the packed op on i64 and view its result as <2 x float> again.
+  SDValue PackedResult =
+      DAG.getNode(PackedOpcode, Loc, MVT::i64, PackedOperands);
+  Results.push_back(DAG.getBitcast(VecVT, PackedResult));
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -5495,6 +5587,12 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::CopyFromReg:
ReplaceCopyFromReg_128(N, DAG, Results);
return;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ ReplaceF32x2Op(N, DAG, Results, useF32FTZ(DAG.getMachineFunction()));
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5adf69d621552f3..f41902fbcaf99a9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -55,6 +55,10 @@ enum NodeType : unsigned {
FSHR_CLAMP,
MUL_WIDE_SIGNED,
MUL_WIDE_UNSIGNED,
+ FADD_F32X2,
+ FMUL_F32X2,
+ FSUB_F32X2,
+ FMA_F32X2,
SETP_F16X2,
SETP_BF16X2,
BFE,
@@ -311,6 +315,9 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue *Parts, unsigned NumParts, MVT PartVT,
std::optional<CallingConv::ID> CC) const override;
+ const TargetRegisterClass *getRegClassFor(MVT VT,
+ bool isDivergent) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 7d9697e40e6aba7..269573cd3d9ae1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -165,6 +165,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
+def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
def True : Predicate<"true">;
def False : Predicate<"false">;
@@ -2638,13 +2639,13 @@ class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
NVPTXInst<(outs), (ins regclass:$a), "$a",
[(LastCallArg (i32 0), vt:$a)]>;
-def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI64 : CallArgInstVT<Int64Regs, i64>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;
-def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI64 : LastCallArgInstVT<Int64Regs, i64>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
@@ -3371,6 +3372,9 @@ let hasSideEffects = false in {
def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
(ins Float32Regs:$s1, Float32Regs:$s2),
"mov.b64 \t$d, {{$s1, $s2}};", []>;
+ def V2F32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
// unpack a larger int register to a set of smaller int registers
def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
@@ -3435,6 +3439,10 @@ def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
(V2I16toI32 $a, $b)>;
def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
(V2I16toI32 $a, $b)>;
+def : Pat<(v2f32 (build_vector f32:$a, f32:$b)),
+ (V2F32toI64 $a, $b)>;
+def : Pat<(i64 (build_pair i32:$a, i32:$b)),
+ (V2I32toI64 $a, $b)>;
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5331f36ad09997f..2402265368f4aed 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1581,6 +1581,28 @@ def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+// F32x2 ops (sm_100+)
+
+def FADD_F32X2 : NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ "add.rn.f32x2 \t$res, $a, $b;", []>,
+ Requires<[hasF32x2Instructions]>;
+
+def FSUB_F32X2 : NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ "sub.rn.f32x2 \t$res, $a, $b;", []>,
+ Requires<[hasF32x2Instructions]>;
+
+def FMUL_F32X2 : NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ "mul.rn.f32x2 \t$res, $a, $b;", []>,
+ Requires<[hasF32x2Instructions]>;
+
+// Packed fused multiply-add on f32x2 values held in 64-bit registers.
+// fma takes three source operands, so all of $a, $b and $c must be printed in
+// the assembly string; otherwise the emitted instruction lacks its addend.
+def FMA_F32X2 : NVPTXInst<(outs Int64Regs:$res),
+                          (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+                          "fma.rn.f32x2 \t$res, $a, $b, $c;", []>,
+                Requires<[hasF32x2Instructions]>;
+
//
// BFIND
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 2011f0f7e328ff4..7630eefe211827f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -62,7 +62,9 @@ def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4)
def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
-def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
+def Int64Regs : NVPTXRegClass<[i64, v2f32], 64,
+ (add (sequence "RL%u", 0, 4),
+ VRFrame64, VRFrameLocal64)>;
// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 851c9152e4cb8ff..2292841a3c66df3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -97,6 +97,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+
// Tcgen05 instructions in Blackwell family
bool hasTcgen05Instructions() const {
bool HasTcgen05 = false;
@@ -112,6 +113,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
return HasTcgen05 && PTXVersion >= 86;
}
+  // Packed f32x2 arithmetic (add/sub/mul/fma .f32x2) needs an sm_100+ target
+  // and PTX ISA 8.6 or newer, so gate on both the SM and the PTX version.
+  bool hasF32x2Instructions() const {
+    return SmVersion >= 100 && PTXVersion >= 86;
+  }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
new file mode 100644
index 000000000000000..984598dd3fb1395
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -0,0 +1,1111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Full FP32x2 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ ret <2 x float> <float 1.0, float 2.0>
+}
+
+define float @test_extract_0(<2 x float> %a) #0 {
+; CHECK-LABEL: test_extract_0(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
+ %e = extractelement <2 x float> %a, i32 0
+ ret float %e
+}
+
+define float @test_extract_1(<2 x float> %a) #0 {
+; CHECK-LABEL: test_extract_1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
+ %e = extractelement <2 x float> %a, i32 1
+ ret float %e
+}
+
+define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; CHECK-LABEL: test_extract_i(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_extract_i_param_1];
+; CHECK-NEXT: setp.eq.s64 %p1, %rd2, 0;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT: ret;
+ %e = extractelement <2 x float> %a, i64 %idx
+ ret float %e
+}
+
+define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fadd(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fadd_param_1];
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> <float 1.0, float 2.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> %a, <float 1.0, float 2.0>
+ ret <2 x float> %r
+}
+
+define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: test_fadd_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
+; CHECK-NEXT: mov.b64 %rd4, {%f7, %f8};
+; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
+; CHECK-NEXT: add.rn.f32x2 %rd5, %rd1, %rd3;
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd4;
+; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
+; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd5;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> %a, %b
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
+; CHECK-LABEL: test_fadd_imm_0_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
+; CHECK-NEXT: mov.f32 %f5, 0f40000000;
+; CHECK-NEXT: mov.f32 %f6, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-NEXT: add.rn.f32x2 %rd4, %rd1, %rd3;
+; CHECK-NEXT: mov.f32 %f7, 0f40800000;
+; CHECK-NEXT: mov.f32 %f8, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f8, %f7};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
+; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
+; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd4;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
+; CHECK-LABEL: test_fadd_imm_1_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
+; CHECK-NEXT: mov.f32 %f5, 0f40000000;
+; CHECK-NEXT: mov.f32 %f6, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-NEXT: add.rn.f32x2 %rd4, %rd1, %rd3;
+; CHECK-NEXT: mov.f32 %f7, 0f40800000;
+; CHECK-NEXT: mov.f32 %f8, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f8, %f7};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
+; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
+; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd4;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %r
+}
+
+define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fsub(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fsub_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ret;
+ %r = fsub <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fneg(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fneg(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-NEXT: mov.b64 {%f2, %f3}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f3};
+; CHECK-NEXT: ret;
+ %r = fsub <2 x float> <float 0.0, float 0.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fmul(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fmul_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fmul_param_1];
+; CHECK-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ret;
+ %r = fmul <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fdiv(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fdiv <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_frem(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<15>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-NEXT: mul.f32 %f7, %f6, %f2;
+; CHECK-NEXT: sub.f32 %f8, %f4, %f7;
+; CHECK-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-NEXT: div.rn.f32 %f10, %f3, %f1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-NEXT: mul.f32 %f12, %f11, %f1;
+; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
+; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-NEXT: ret;
+ %r = frem <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: test_ldst_v2f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-NEXT: ld.b64 %rd3, [%rd1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-NEXT: ret;
+ %t1 = load <2 x float>, ptr %a
+ store <2 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: test_ldst_v3f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
+; CHECK-NEXT: ld.u64 %rd3, [%rd1];
+; CHECK-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-NEXT: st.f32 [%rd2+8], %f1;
+; CHECK-NEXT: st.u64 [%rd2], %rd3;
+; CHECK-NEXT: ret;
+ %t1 = load <3 x float>, ptr %a
+ store <3 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: test_ldst_v4f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT: ret;
+ %t1 = load <4 x float>, ptr %a
+ store <4 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: test_ldst_v8f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT: ret;
+ %t1 = load <8 x float>, ptr %a
+ store <8 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
+
+define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_call(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1];
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd2;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-NEXT: ret;
+ %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_call_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-NEXT: ret;
+ %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: { // callseq 2, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-NEXT: } // callseq 2
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-NEXT: ret;
+ %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1];
+; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ret;
+ %r = select i1 %c, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
+; CHECK-LABEL: test_select_cc(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<11>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
+; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd4;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-NEXT: mov.b64 {%f5, %f6}, %rd2;
+; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
+; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
+; CHECK-NEXT: ret;
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
+; CHECK-LABEL: test_select_cc_f64_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f64_f32_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f64_f32_param_3];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-NEXT: ret;
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %r
+}
+
+define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
+; CHECK-LABEL: test_select_cc_f32_f64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
+; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %cc = fcmp une <2 x double> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.neu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.neu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp une <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.equ.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.equ.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ueq <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.gtu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.gtu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ugt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.geu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.geu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp uge <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ltu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ltu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ult <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.leu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.leu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ule <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.nan.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.nan.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp uno <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ne.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ne.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp one <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.eq.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.eq.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp oeq <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.gt.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.gt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ogt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ge.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ge.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp oge <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.lt.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.lt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp olt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.le.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.le.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ole <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.num.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.num.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
+ %r = fcmp ord <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
+ %r = fptosi <2 x float> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ret;
+ %r = fptosi <2 x float> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
+ %r = fptoui <2 x float> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ret;
+ %r = fptoui <2 x float> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ %r = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ %r = uitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ %r = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ %r = sitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: ret;
+ %c = uitofp <2 x i32> %a to <2 x float>
+ %r = fadd <2 x float> %b, %c
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
+ %r = fptrunc <2 x double> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { ; NOTE(review): no CHECK lines for this function - its codegen is not verified; regenerate the assertions
+ %r = fpext <2 x float> %a to <2 x double> ; widen both f32 lanes to f64
+ ret <2 x double> %r
+}
+
+define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { ; NOTE(review): no CHECK lines for this function - its codegen is not verified; regenerate the assertions
+ %r = bitcast <2 x float> %a to <2 x i32> ; lane-for-lane reinterpretation, vector to vector of the same width
+ ret <2 x i32> %r
+}
+
+define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { ; NOTE(review): no CHECK lines for this function - its codegen is not verified; regenerate the assertions
+ %r = bitcast <2 x i32> %a to <2 x float> ; inverse direction of test_bitcast_2xfloat_to_2xi32
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { ; NOTE(review): no CHECK lines for this function - its codegen is not verified; regenerate the assertions
+ %r = bitcast double %a to <2 x float> ; 64-bit scalar reinterpreted as two f32 lanes
+ ret <2 x float> %r
+}
+
+define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { ; NOTE(review): no CHECK lines for this function - its codegen is not verified; regenerate the assertions
+ %r = bitcast <2 x float> %a to double ; two f32 lanes reinterpreted as one 64-bit scalar
+ ret double %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
>From 03a55abadaaf120f5ee670b939c25c5ce74cf2d4 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 01:21:33 -0800
Subject: [PATCH 02/22] write F32X2 result into two i32 registers
Allows better codegen as each register can be forwarded through
subsequent EXTRACT_VECTOR_ELT nodes.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 15 ++++++++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 33 +++++++++++++++++++--
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 991afa0d5ec9a81..6a604227326ae53 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -196,6 +196,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
SelectI128toV2I64(N);
return;
}
+ if (N->getOperand(1).getValueType() == MVT::i64 && N->getNumValues() == 3) {
+ SelectI64ToV2I32(N);
+ return;
+ }
break;
}
case ISD::FADD:
@@ -2795,6 +2799,17 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
ReplaceNode(N, Mov);
}
+void NVPTXDAGToDAGISel::SelectI64ToV2I32(SDNode *N) { // Replaces N with an NVPTX::I64toV2I32 machine node.
+ SDValue Ch = N->getOperand(0); // chain input; this patch builds the node with the CopyToReg result here
+ SDValue Src = N->getOperand(1); // the i64 operand (Select() only dispatches here when it is MVT::i64)
+ SDLoc DL(N);
+ // Results are {i32, i32, <type of Ch>}; note the machine node takes {Src, Ch}, i.e. the chain goes last. NOTE(review): presumably I64toV2I32 unpacks the two 32-bit halves of Src - confirm in the .td definition.
+ SDNode *Mov = CurDAG->getMachineNode(NVPTX::I64toV2I32, DL,
+ {MVT::i32, MVT::i32, Ch.getValueType()},
+ {Src, Ch});
+ ReplaceNode(N, Mov); // N has three results (checked by the caller), matching Mov's three result types
+}
+
/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index db23bcc096a6afb..6792ccc2b168bd3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -92,6 +92,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
+ void SelectI64ToV2I32(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 306ab952b2af757..d793e2a0c3a4861 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -872,6 +872,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(Op, MVT::v2f32, Custom);
// Handle custom lowering for: i64 = bitcast v2f32
setOperationAction(ISD::BITCAST, MVT::v2f32, Custom);
+ // Handle custom lowering for: f32 = extract_vector_elt v2f32
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
}
// These map to conversion instructions for scalar FP types.
@@ -2253,6 +2255,20 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
}
+ if (VectorVT == MVT::v2f32) {
+ if (Vector.getOpcode() == ISD::BITCAST) {
+ // peek through v2f32 = bitcast (i64 = build_pair (i32 A, i32 B))
+ // where A:i32, B:i32 = CopyFromReg (i64 = F32X2 Operation ...)
+ SDValue Pair = Vector.getOperand(0);
+ assert(Pair.getOpcode() == ISD::BUILD_PAIR);
+ return DAG.getNode(
+ ISD::BITCAST, DL, Op.getValueType(),
+ Pair.getOperand(cast<ConstantSDNode>(Index)->getZExtValue()));
+ }
+ if (Vector.getOpcode() == ISD::BUILD_VECTOR)
+ return Vector.getOperand(cast<ConstantSDNode>(Index)->getZExtValue());
+ }
+
// Constant index will be matched by tablegen.
if (isa<ConstantSDNode>(Index.getNode()))
return Op;
@@ -5565,9 +5581,22 @@ static void ReplaceF32x2Op(SDNode *N, SelectionDAG &DAG,
for (const SDValue &Op : N->ops())
NewOps.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op));
- // cast i64 result of new op back to <2 x float>
+ SDValue Chain = DAG.getEntryNode();
+
+ // break i64 result into two i32 registers for later instructions that may
+ // access element #0 or #1. otherwise, this code will be eliminated
SDValue NewValue = DAG.getNode(Opcode, DL, MVT::i64, NewOps);
- Results.push_back(DAG.getBitcast(OldResultTy, NewValue));
+ MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+ Register DestReg = RegInfo.createVirtualRegister(
+ DAG.getTargetLoweringInfo().getRegClassFor(MVT::i64));
+ SDValue RegCopy = DAG.getCopyToReg(Chain, DL, DestReg, NewValue);
+ SDValue Explode = DAG.getNode(ISD::CopyFromReg, DL,
+ {MVT::i32, MVT::i32, Chain.getValueType()},
+ {RegCopy, DAG.getRegister(DestReg, MVT::i64)});
+ // cast i64 result of new op back to <2 x float>
+ Results.push_back(DAG.getBitcast(
+ OldResultTy, DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+ {Explode.getValue(0), Explode.getValue(1)})));
}
void NVPTXTargetLowering::ReplaceNodeResults(
>From af67f9f5fc5f4db42bbf0cec9b44f4530302ca2c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 01:36:59 -0800
Subject: [PATCH 03/22] handle extracts with non-constant indices
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d793e2a0c3a4861..39527a8c579b605 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2256,17 +2256,24 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
if (VectorVT == MVT::v2f32) {
+ auto GetOperand = [&DAG, &DL](SDValue Op, SDValue Index) {
+ if (const auto *ConstIdx = dyn_cast<ConstantSDNode>(Index))
+ return Op.getOperand(ConstIdx->getZExtValue());
+ SDValue E0 = Op.getOperand(0);
+ SDValue E1 = Op.getOperand(1);
+ return DAG.getSelectCC(DL, Index, DAG.getIntPtrConstant(0, DL), E0, E1,
+ ISD::CondCode::SETEQ);
+ };
if (Vector.getOpcode() == ISD::BITCAST) {
// peek through v2f32 = bitcast (i64 = build_pair (i32 A, i32 B))
// where A:i32, B:i32 = CopyFromReg (i64 = F32X2 Operation ...)
SDValue Pair = Vector.getOperand(0);
assert(Pair.getOpcode() == ISD::BUILD_PAIR);
- return DAG.getNode(
- ISD::BITCAST, DL, Op.getValueType(),
- Pair.getOperand(cast<ConstantSDNode>(Index)->getZExtValue()));
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(),
+ GetOperand(Pair, Index));
}
if (Vector.getOpcode() == ISD::BUILD_VECTOR)
- return Vector.getOperand(cast<ConstantSDNode>(Index)->getZExtValue());
+ return GetOperand(Vector, Index);
}
// Constant index will be matched by tablegen.
>From e0b32d07804ba19d41de492cf58af8e251de9e1f Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 13:45:58 -0800
Subject: [PATCH 04/22] let SelectionDAG expand v2f32 operands of
extract_vector_elt
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 39527a8c579b605..4599fb6f88c7ee3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2274,6 +2274,9 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
if (Vector.getOpcode() == ISD::BUILD_VECTOR)
return GetOperand(Vector, Index);
+
+ // Otherwise, let SelectionDAG expand the operand.
+ return SDValue();
}
// Constant index will be matched by tablegen.
>From 29411df8855410f6dca4b95705104f689c94f544 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 14:06:27 -0800
Subject: [PATCH 05/22] let SelectionDAG expand v2f32 operands of bitcast
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4599fb6f88c7ee3..96358afd25c8490 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2163,6 +2163,9 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
DAG.getBitcast(MVT::i32, BVOp0),
DAG.getBitcast(MVT::i32, BVOp1));
}
+
+ // Otherwise, let SelectionDAG expand the operand.
+ return SDValue();
}
return Op;
}
>From 6227d6ee2dce646e40c0f074bbdc0ff0b18c8ab3 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 14:30:58 -0800
Subject: [PATCH 06/22] convert assertions to conditionals for bitcasts
Replace the assertions with conditional checks so that other kinds of
bitcasts are handled instead of tripping an assertion. With this, all test
cases in the f32x2-instructions.ll suite pass.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 96358afd25c8490..59e1d3266622712 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2133,12 +2133,10 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
}
if (FromVT == MVT::v2f32) {
- assert(ToVT == MVT::i64);
-
// A bitcast to i64 from v2f32.
// See if we can legalize the operand.
const SDValue &Operand = Op->getOperand(0);
- if (Operand.getOpcode() == ISD::BUILD_VECTOR) {
+ if (ToVT == MVT::i64 && Operand.getOpcode() == ISD::BUILD_VECTOR) {
const SDValue &BVOp0 = Operand.getOperand(0);
const SDValue &BVOp1 = Operand.getOperand(1);
@@ -2267,11 +2265,11 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getSelectCC(DL, Index, DAG.getIntPtrConstant(0, DL), E0, E1,
ISD::CondCode::SETEQ);
};
- if (Vector.getOpcode() == ISD::BITCAST) {
+ if (SDValue Pair = Vector.getOperand(0);
+ Vector.getOpcode() == ISD::BITCAST &&
+ Pair.getOpcode() == ISD::BUILD_PAIR) {
// peek through v2f32 = bitcast (i64 = build_pair (i32 A, i32 B))
// where A:i32, B:i32 = CopyFromReg (i64 = F32X2 Operation ...)
- SDValue Pair = Vector.getOperand(0);
- assert(Pair.getOpcode() == ISD::BUILD_PAIR);
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(),
GetOperand(Pair, Index));
}
>From 038b838b549dc4b25d7c385e4f8784b4fb87fa8d Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 14:33:28 -0800
Subject: [PATCH 07/22] update f32x2-instructions test
---
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 662 ++++++++++--------
1 file changed, 362 insertions(+), 300 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 984598dd3fb1395..c2bbec6e7cd055f 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -28,12 +28,10 @@ define <2 x float> @test_ret_const() #0 {
define float @test_extract_0(<2 x float> %a) #0 {
; CHECK-LABEL: test_extract_0(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%e = extractelement <2 x float> %a, i32 0
@@ -43,13 +41,11 @@ define float @test_extract_0(<2 x float> %a) #0 {
define float @test_extract_1(<2 x float> %a) #0 {
; CHECK-LABEL: test_extract_1(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<2>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f32 %f<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
-; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
; CHECK-NEXT: ret;
%e = extractelement <2 x float> %a, i32 1
ret float %e
@@ -60,13 +56,12 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .f32 %f<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
-; CHECK-NEXT: ld.param.u64 %rd2, [test_extract_i_param_1];
-; CHECK-NEXT: setp.eq.s64 %p1, %rd2, 0;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
+; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
; CHECK-NEXT: ret;
@@ -77,15 +72,24 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fadd(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fadd_param_1];
-; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
+; CHECK-NEXT: mov.b32 %r3, %f4;
+; CHECK-NEXT: mov.b32 %r4, %f3;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r6;
+; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
@@ -94,17 +98,21 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_0(
; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
-; CHECK-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r4;
+; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
@@ -113,17 +121,21 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_1(
; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
-; CHECK-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r4;
+; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
@@ -132,21 +144,34 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: test_fadd_v4(
; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-NEXT: .reg .f32 %f<13>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
+; CHECK-NEXT: mov.b32 %r1, %f4;
+; CHECK-NEXT: mov.b32 %r2, %f3;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: mov.b32 %r4, %f1;
; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
-; CHECK-NEXT: mov.b64 %rd4, {%f7, %f8};
-; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
-; CHECK-NEXT: add.rn.f32x2 %rd5, %rd1, %rd3;
-; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd4;
-; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
-; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd5;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: mov.b32 %r5, %f6;
+; CHECK-NEXT: mov.b32 %r6, %f5;
+; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5};
+; CHECK-NEXT: mov.b64 %rd4, {%r4, %r3};
+; CHECK-NEXT: add.rn.f32x2 %rd2, %rd4, %rd3;
+; CHECK-NEXT: mov.b32 %r7, %f8;
+; CHECK-NEXT: mov.b32 %r8, %f7;
+; CHECK-NEXT: mov.b64 %rd5, {%r8, %r7};
+; CHECK-NEXT: mov.b64 %rd6, {%r2, %r1};
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
+; CHECK-NEXT: mov.b64 {%r9, %r10}, %rd2;
+; CHECK-NEXT: mov.b64 {%r11, %r12}, %rd1;
+; CHECK-NEXT: mov.b32 %f9, %r12;
+; CHECK-NEXT: mov.b32 %f10, %r11;
+; CHECK-NEXT: mov.b32 %f11, %r10;
+; CHECK-NEXT: mov.b32 %f12, %r9;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -155,24 +180,29 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_0_v4(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
-; CHECK-NEXT: mov.f32 %f5, 0f40000000;
-; CHECK-NEXT: mov.f32 %f6, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-NEXT: add.rn.f32x2 %rd4, %rd1, %rd3;
-; CHECK-NEXT: mov.f32 %f7, 0f40800000;
-; CHECK-NEXT: mov.f32 %f8, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f8, %f7};
-; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
-; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
-; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd4;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: mov.b32 %r1, %f4;
+; CHECK-NEXT: mov.b32 %r2, %f3;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: mov.b32 %r4, %f1;
+; CHECK-NEXT: mov.b64 %rd3, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
+; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
+; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r8;
+; CHECK-NEXT: mov.b32 %f6, %r7;
+; CHECK-NEXT: mov.b32 %f7, %r6;
+; CHECK-NEXT: mov.b32 %f8, %r5;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -181,24 +211,29 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_1_v4(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
-; CHECK-NEXT: mov.f32 %f5, 0f40000000;
-; CHECK-NEXT: mov.f32 %f6, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-NEXT: add.rn.f32x2 %rd4, %rd1, %rd3;
-; CHECK-NEXT: mov.f32 %f7, 0f40800000;
-; CHECK-NEXT: mov.f32 %f8, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f8, %f7};
-; CHECK-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
-; CHECK-NEXT: mov.b64 {%f9, %f10}, %rd6;
-; CHECK-NEXT: mov.b64 {%f11, %f12}, %rd4;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-NEXT: mov.b32 %r1, %f4;
+; CHECK-NEXT: mov.b32 %r2, %f3;
+; CHECK-NEXT: mov.b32 %r3, %f2;
+; CHECK-NEXT: mov.b32 %r4, %f1;
+; CHECK-NEXT: mov.b64 %rd3, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
+; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
+; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r8;
+; CHECK-NEXT: mov.b32 %f6, %r7;
+; CHECK-NEXT: mov.b32 %f7, %r6;
+; CHECK-NEXT: mov.b32 %f8, %r5;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -207,15 +242,24 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fsub(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fsub_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fsub_param_1];
-; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
+; CHECK-NEXT: mov.b32 %r3, %f4;
+; CHECK-NEXT: mov.b32 %r4, %f3;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r6;
+; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
@@ -224,16 +268,21 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
define <2 x float> @test_fneg(<2 x float> %a) #0 {
; CHECK-LABEL: test_fneg(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
-; CHECK-NEXT: mov.f32 %f1, 0f00000000;
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f1};
-; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-NEXT: mov.b64 {%f2, %f3}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f3};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd3, 0;
+; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r4;
+; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
@@ -242,15 +291,24 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fmul(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fmul_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fmul_param_1];
-; CHECK-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
+; CHECK-NEXT: mov.b32 %r3, %f4;
+; CHECK-NEXT: mov.b32 %r4, %f3;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: mul.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r6;
+; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
@@ -260,15 +318,12 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fdiv(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
-; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_param_0];
+; CHECK-NEXT: div.rn.f32 %f5, %f2, %f4;
+; CHECK-NEXT: div.rn.f32 %f6, %f1, %f3;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fdiv <2 x float> %a, %b
@@ -280,25 +335,22 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<15>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_param_0];
+; CHECK-NEXT: div.rn.f32 %f5, %f2, %f4;
; CHECK-NEXT: cvt.rzi.f32.f32 %f6, %f5;
-; CHECK-NEXT: mul.f32 %f7, %f6, %f2;
-; CHECK-NEXT: sub.f32 %f8, %f4, %f7;
-; CHECK-NEXT: testp.infinite.f32 %p1, %f2;
-; CHECK-NEXT: selp.f32 %f9, %f4, %f8, %p1;
-; CHECK-NEXT: div.rn.f32 %f10, %f3, %f1;
+; CHECK-NEXT: mul.f32 %f7, %f6, %f4;
+; CHECK-NEXT: sub.f32 %f8, %f2, %f7;
+; CHECK-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-NEXT: div.rn.f32 %f10, %f1, %f3;
; CHECK-NEXT: cvt.rzi.f32.f32 %f11, %f10;
-; CHECK-NEXT: mul.f32 %f12, %f11, %f1;
-; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
-; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
-; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-NEXT: mul.f32 %f12, %f11, %f3;
+; CHECK-NEXT: sub.f32 %f13, %f1, %f12;
+; CHECK-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-NEXT: selp.f32 %f14, %f1, %f13, %p2;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
; CHECK-NEXT: ret;
%r = frem <2 x float> %a, %b
@@ -309,13 +361,12 @@ define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: test_ldst_v2f32(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
-; CHECK-NEXT: ld.b64 %rd3, [%rd1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
; CHECK-NEXT: ret;
%t1 = load <2 x float>, ptr %a
@@ -330,8 +381,8 @@ define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
; CHECK-NEXT: ld.u64 %rd3, [%rd1];
; CHECK-NEXT: ld.f32 %f1, [%rd1+8];
; CHECK-NEXT: st.f32 [%rd2+8], %f1;
@@ -349,8 +400,8 @@ define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
; CHECK-NEXT: ret;
@@ -366,8 +417,8 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
@@ -384,13 +435,10 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_call(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1];
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd2;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
@@ -415,18 +463,15 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_call_flipped(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_flipped_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_flipped_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
; CHECK-NEXT: call.uni (retval0),
; CHECK-NEXT: test_callee,
@@ -446,18 +491,15 @@ define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_tailcall_flipped(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_tailcall_flipped_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_tailcall_flipped_param_0];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
; CHECK-NEXT: call.uni (retval0),
; CHECK-NEXT: test_callee,
@@ -478,18 +520,17 @@ define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1];
; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_param_0];
+; CHECK-NEXT: selp.f32 %f5, %f2, %f4, %p1;
+; CHECK-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = select i1 %c, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
@@ -500,21 +541,16 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<11>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
-; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
-; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd4;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
-; CHECK-NEXT: mov.b64 {%f5, %f6}, %rd2;
-; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
-; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
-; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
+; CHECK-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_param_3];
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_select_cc_param_2];
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_param_0];
+; CHECK-NEXT: setp.neu.f32 %p1, %f5, %f7;
+; CHECK-NEXT: setp.neu.f32 %p2, %f6, %f8;
+; CHECK-NEXT: selp.f32 %f9, %f2, %f4, %p2;
+; CHECK-NEXT: selp.f32 %f10, %f1, %f3, %p1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
; CHECK-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
@@ -527,18 +563,15 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-NEXT: .reg .f64 %fd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f64_f32_param_3];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f64_f32_param_2];
; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f64_f32_param_2];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f64_f32_param_3];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3;
+; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4;
; CHECK-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
; CHECK-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
@@ -553,20 +586,17 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-NEXT: .reg .f64 %fd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
-; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f64_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f64_param_0];
; CHECK-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
; CHECK-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
-; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
+; CHECK-NEXT: selp.f32 %f5, %f2, %f4, %p2;
+; CHECK-NEXT: selp.f32 %f6, %f1, %f3, %p1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%cc = fcmp une <2 x double> %c, %d
@@ -580,15 +610,12 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.neu.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.neu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_une_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_une_param_0];
+; CHECK-NEXT: setp.neu.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.neu.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -604,15 +631,12 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.equ.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.equ.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ueq_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ueq_param_0];
+; CHECK-NEXT: setp.equ.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.equ.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -628,15 +652,12 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.gtu.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.gtu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ugt_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ugt_param_0];
+; CHECK-NEXT: setp.gtu.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.gtu.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -652,15 +673,12 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.geu.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.geu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uge_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uge_param_0];
+; CHECK-NEXT: setp.geu.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.geu.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -676,15 +694,12 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.ltu.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.ltu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ult_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ult_param_0];
+; CHECK-NEXT: setp.ltu.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.ltu.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -700,15 +715,12 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.leu.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.leu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ule_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ule_param_0];
+; CHECK-NEXT: setp.leu.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.leu.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -724,15 +736,12 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.nan.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.nan.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uno_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uno_param_0];
+; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.nan.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -748,15 +757,12 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.ne.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.ne.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_one_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_one_param_0];
+; CHECK-NEXT: setp.ne.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.ne.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -772,15 +778,12 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.eq.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.eq.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oeq_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oeq_param_0];
+; CHECK-NEXT: setp.eq.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.eq.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -796,15 +799,12 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.gt.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.gt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ogt_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ogt_param_0];
+; CHECK-NEXT: setp.gt.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.gt.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -820,15 +820,12 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.ge.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.ge.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oge_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oge_param_0];
+; CHECK-NEXT: setp.ge.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.ge.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -844,15 +841,12 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.lt.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.lt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_olt_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_olt_param_0];
+; CHECK-NEXT: setp.lt.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.lt.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -868,15 +862,12 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.le.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.le.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ole_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ole_param_0];
+; CHECK-NEXT: setp.le.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.le.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -892,15 +883,12 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-NEXT: setp.num.f32 %p1, %f4, %f2;
-; CHECK-NEXT: setp.num.f32 %p2, %f3, %f1;
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ord_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ord_param_0];
+; CHECK-NEXT: setp.num.f32 %p1, %f2, %f4;
+; CHECK-NEXT: setp.num.f32 %p2, %f1, %f3;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
@@ -915,11 +903,9 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i32_param_0];
; CHECK-NEXT: cvt.rzi.s32.f32 %r1, %f2;
; CHECK-NEXT: cvt.rzi.s32.f32 %r2, %f1;
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
@@ -932,14 +918,13 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
; CHECK-LABEL: test_fptosi_i64(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i64_param_0];
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd1, %f2;
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
; CHECK-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i64>
ret <2 x i64> %r
@@ -950,11 +935,9 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi32_param_0];
; CHECK-NEXT: cvt.rzi.u32.f32 %r1, %f2;
; CHECK-NEXT: cvt.rzi.u32.f32 %r2, %f1;
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
@@ -967,14 +950,13 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
; CHECK-LABEL: test_fptoui_2xi64(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd1, %f2;
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
; CHECK-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i64>
ret <2 x i64> %r
@@ -1047,19 +1029,29 @@ define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_uitofp_2xi32_fadd(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_uitofp_2xi32_fadd_param_1];
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-NEXT: cvt.rn.f32.u32 %f3, %r1;
+; CHECK-NEXT: mov.b32 %r3, %f3;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r3;
+; CHECK-NEXT: cvt.rn.f32.u32 %f4, %r2;
+; CHECK-NEXT: mov.b32 %r4, %f4;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r4;
+; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-NEXT: mov.b32 %r5, %f2;
+; CHECK-NEXT: mov.b32 %r6, %f1;
+; CHECK-NEXT: mov.b64 %rd6, {%r6, %r5};
+; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
+; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r8;
+; CHECK-NEXT: mov.b32 %f6, %r7;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
@@ -1083,26 +1075,96 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
}
define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-NEXT: ret;
%r = fpext <2 x float> %a to <2 x double>
ret <2 x double> %r
}
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f2;
+; CHECK-NEXT: mov.b32 %r2, %f1;
+; CHECK-NEXT: mov.b64 %rd1, {%r2, %r1};
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: mov.b32 %f2, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
%r = bitcast <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
+; CHECK-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
+; CHECK-NEXT: mov.b32 %f1, %r2;
+; CHECK-NEXT: mov.b32 %f2, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT: ret;
%r = bitcast double %a to <2 x float>
ret <2 x float> %r
}
define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f1;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r1;
+; CHECK-NEXT: mov.b32 %r2, %f2;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
+; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: mov.b64 %fd1, %rd4;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT: ret;
%r = bitcast <2 x float> %a to double
ret double %r
}
>From bf588080ad15203d9f61350879a252684c05902a Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 18:12:51 -0800
Subject: [PATCH 08/22] add combiner rule for i64 or -> build_pair
Rule:
i64 or (i64 zext (i32 X), i64 shl (i64 any_ext (i32 Y), 32))
-> i64 build_pair (X, Y)
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 59e1d3266622712..da846363644fc53 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -874,6 +874,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2f32, Custom);
// Handle custom lowering for: f32 = extract_vector_elt v2f32
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ // Combine:
+ // i64 = or (i64 = zero_extend X, i64 = shl (i64 = any_extend Y, 32))
+ // -> i64 = build_pair (X, Y)
+ setTargetDAGCombine(ISD::OR);
}
// These map to conversion instructions for scalar FP types.
@@ -5268,6 +5272,31 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
}
+/// Fold an i64 OR that assembles two i32 halves into a single BUILD_PAIR:
+///
+///   i64 = or (i64 = zero_extend (i32 A)),
+///            (i64 = shl (i64 = any_extend (i32 B)), 32)
+///     -> i64 = build_pair (A, B)
+///
+/// \param N        The ISD::OR node being combined.
+/// \param DCI      Combiner state; used to create the replacement node.
+/// \param OptLevel Current optimization level; nothing is folded at -O0.
+/// \returns The BUILD_PAIR node, or an empty SDValue if the pattern does not
+///          match.
+static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                                CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op1.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  SDValue SHLOp0 = Op1.getOperand(0);
+  const auto *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+  if (!ShAmt || ShAmt->getZExtValue() != 32 ||
+      SHLOp0.getOpcode() != ISD::ANY_EXTEND)
+    return SDValue();
+
+  // BUILD_PAIR operands must each be exactly half the width of the result.
+  // The extends may come from narrower types (e.g. i16), in which case the
+  // OR is not a plain concatenation of two i32 values and must be left alone.
+  SDValue Lo = Op0.getOperand(0);
+  SDValue Hi = SHLOp0.getOperand(0);
+  if (Lo.getValueType() != MVT::i32 || Hi.getValueType() != MVT::i32)
+    return SDValue();
+
+  SDLoc DL(N);
+  return DCI.DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, {Lo, Hi});
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5302,6 +5331,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVSELECTCombine(N, DCI);
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
+ case ISD::OR:
+ return PerformORCombine(N, DCI, OptLevel);
}
return SDValue();
}
>From f643c3b159d12a92790d8282f204b910b78a1a3b Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 22:28:08 -0800
Subject: [PATCH 09/22] add combiner rule for i32 trunc (i64 srl (i64
build_pair (X, Y)), 32) -> i32 Y
Rule:
i32 = truncate (i64 = srl (i64 = build_pair (i32 X, i32 Y), 32))
-> i32 Y
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 27 +++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index da846363644fc53..78d90069d52b8f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -878,6 +878,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// i64 = or (i64 = zero_extend X, i64 = shl (i64 = any_extend Y, 32))
// -> i64 = build_pair (X, Y)
setTargetDAGCombine(ISD::OR);
+ // i32 = truncate (i64 = srl (i64 = build_pair (X, Y), 32))
+ // -> i32 Y
+ setTargetDAGCombine(ISD::TRUNCATE);
}
// These map to conversion instructions for scalar FP types.
@@ -5297,6 +5300,28 @@ static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+/// Fold a truncate that extracts the high half of an i64 BUILD_PAIR:
+///
+///   i32 = truncate (i64 = srl (i64 = build_pair (i32 A, i32 B), 32))
+///     -> i32 B
+///
+/// \param N        The ISD::TRUNCATE node being combined.
+/// \param DCI      Combiner state (unused here).
+/// \param OptLevel Current optimization level; nothing is folded at -O0.
+/// \returns The second BUILD_PAIR operand, or an empty SDValue if the pattern
+///          does not match.
+static SDValue PerformTRUNCATECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
+  // The replacement value is an i32 half of the pair, so the truncate itself
+  // must produce i32; a narrower truncate would change the node's type.
+  if (N->getValueType(0) != MVT::i32)
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue SrlOp = Op.getOperand(0);
+  const auto *ShAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!ShAmt || ShAmt->getZExtValue() != 32)
+    return SDValue();
+
+  // Only an i64 pair has i32 halves; for a wider pair a shift by 32 does not
+  // select a whole operand.
+  if (SrlOp.getOpcode() != ISD::BUILD_PAIR ||
+      SrlOp.getValueType() != MVT::i64)
+    return SDValue();
+
+  // Operand 1 of the pair is the half selected by the shift (B above).
+  return SrlOp.getOperand(1);
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5333,6 +5358,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::OR:
return PerformORCombine(N, DCI, OptLevel);
+ case ISD::TRUNCATE:
+ return PerformTRUNCATECombine(N, DCI, OptLevel);
}
return SDValue();
}
>From e9868be2f8063f7e6493f00a6c5e9220ef32431a Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Mon, 3 Feb 2025 23:12:08 -0800
Subject: [PATCH 10/22] add combiner rule for i64 build_pair on
Copy{From,To}Reg
Rule:
i64 = build_pair ({i32, i32} = CopyFromReg (CopyToReg (i64 X)))
-> i64 X
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 +++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 78d90069d52b8f9..7e2d6f4284bba58 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -881,6 +881,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// i32 = truncate (i64 = srl (i64 = build_pair (X, Y), 32))
// -> i32 Y
setTargetDAGCombine(ISD::TRUNCATE);
+ // i64 = build_pair ({i32, i32} = CopyFromReg (CopyToReg (i64 X)))
+ // -> i64 X
+ setTargetDAGCombine(ISD::BUILD_PAIR);
}
// These map to conversion instructions for scalar FP types.
@@ -5322,6 +5325,31 @@ static SDValue PerformTRUNCATECombine(SDNode *N,
return SDValue();
}
+/// Fold a BUILD_PAIR that reassembles an i64 which was copied through a
+/// register:
+///
+///   i64 = build_pair ({i32, i32} = CopyFromReg (CopyToReg (i64 X)))
+///     -> i64 X
+///
+/// \param N        The ISD::BUILD_PAIR node being combined.
+/// \param DCI      Combiner state (unused here).
+/// \param OptLevel Current optimization level; nothing is folded at -O0.
+/// \returns The value fed to the CopyToReg, or an empty SDValue if the
+///          pattern does not match.
+static SDValue PerformBUILD_PAIRCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  // Both halves must come from one CopyFromReg, as results 0 and 1 in that
+  // order, to match the documented pattern; a swapped pair is not X.
+  if (Op0.getOpcode() != ISD::CopyFromReg ||
+      Op1.getNode() != Op0.getNode() || Op0.getResNo() != 0 ||
+      Op1.getResNo() != 1)
+    return SDValue();
+
+  // The CopyFromReg must be chained directly to a CopyToReg of the same
+  // register.
+  SDValue CFRChain = Op0.getOperand(0);
+  if (CFRChain.getOpcode() != ISD::CopyToReg)
+    return SDValue();
+
+  Register Reg = cast<RegisterSDNode>(Op0.getOperand(1))->getReg();
+  if (cast<RegisterSDNode>(CFRChain.getOperand(1))->getReg() != Reg)
+    return SDValue();
+
+  // The replacement must have the same type as the node it replaces.
+  SDValue Value = CFRChain.getOperand(2);
+  if (Value.getValueType() != MVT::i64)
+    return SDValue();
+
+  return Value;
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5360,6 +5388,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformORCombine(N, DCI, OptLevel);
case ISD::TRUNCATE:
return PerformTRUNCATECombine(N, DCI, OptLevel);
+ case ISD::BUILD_PAIR:
+ return PerformBUILD_PAIRCombine(N, DCI, OptLevel);
}
return SDValue();
}
>From d4165bffd40f1af219619b330a50977375a7d4ca Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 4 Feb 2025 09:02:01 -0800
Subject: [PATCH 11/22] use V2F32toI64 for i64 build_pair (i32 bitcast f32, i32
bitcast f32)
Add a dag pattern to lower:
i64 = build_pair (i32 = bitcast (f32 A), i32 = bitcast (f32 B))
-> i64 = V2F32toI64 A, B
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 269573cd3d9ae1d..ee6b3db4a195bd3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3441,6 +3441,9 @@ def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
(V2I16toI32 $a, $b)>;
def : Pat<(v2f32 (build_vector f32:$a, f32:$b)),
(V2F32toI64 $a, $b)>;
+def : Pat<(i64 (build_pair (i32 (bitconvert f32:$a)),
+ (i32 (bitconvert f32:$b)))),
+ (V2F32toI64 $a, $b)>;
def : Pat<(i64 (build_pair i32:$a, i32:$b)),
(V2I32toI64 $a, $b)>;
>From 4af1276dbf3d9504db363bb9c42043da76a1f6f4 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 4 Feb 2025 22:30:46 -0800
Subject: [PATCH 12/22] add -O3 test case for f32x2
Currently breaking for test_extract_i case because parameter symbols
cannot appear in add instructions. We need an intermediate mov.
---
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index c2bbec6e7cd055f..4ec77b852744726 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -1,13 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; ## Full FP32x2 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
-; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 \
; RUN: %}
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 \
+; RUN: %}
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "nvptx64-nvidia-cuda"
>From 4a3de5e619124dfa1a97f30cab0633c1450b55eb Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 5 Feb 2025 09:37:52 -0800
Subject: [PATCH 13/22] remove unnecessary bitcast in constant eval
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7e2d6f4284bba58..46c302358dfc4bc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2162,8 +2162,7 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// cast two constants
APInt Value(64, 0);
Value = CastToAPInt(BVOp0) | CastToAPInt(BVOp1).shl(32);
- SDValue Const = DAG.getConstant(Value, DL, MVT::i64);
- return DAG.getBitcast(ToVT, Const);
+ return DAG.getConstant(Value, DL, MVT::i64);
}
// otherwise build an i64
>From 097ac44e1dffb4772e19309ed3f1a712255bbe33 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Feb 2025 14:08:32 -0800
Subject: [PATCH 14/22] enable ftz support
And temporarily disable -O3 in testing as it exposes an existing bug
with how test_extract_i() is lowered when optimized.
---
.../include/llvm/Target/TargetSelectionDAG.td | 3 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 --
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 -
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 65 ++-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 500 +++++++++++++-----
5 files changed, 426 insertions(+), 173 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 1e432f55ad4b6cc..394428594b98700 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -115,6 +115,9 @@ def SDTPtrAddOp : SDTypeProfile<1, 2, [ // ptradd
def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>
]>;
+def SDTIntTernaryOp : SDTypeProfile<1, 3, [ // fma32x2
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>
+]>;
def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl
SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>
]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6a604227326ae53..ea83ad449c10752 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -121,12 +121,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::SETP_BF16X2:
SelectSETP_BF16X2(N);
return;
- case NVPTXISD::FADD_F32X2:
- case NVPTXISD::FSUB_F32X2:
- case NVPTXISD::FMUL_F32X2:
- case NVPTXISD::FMA_F32X2:
- SelectF32X2Op(N);
- return;
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
if (tryLoadVector(N))
@@ -305,30 +299,6 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
return true;
}
-void NVPTXDAGToDAGISel::SelectF32X2Op(SDNode *N) {
- unsigned Opcode;
- switch (N->getOpcode()) {
- case NVPTXISD::FADD_F32X2:
- Opcode = NVPTX::FADD_F32X2;
- break;
- case NVPTXISD::FSUB_F32X2:
- Opcode = NVPTX::FSUB_F32X2;
- break;
- case NVPTXISD::FMUL_F32X2:
- Opcode = NVPTX::FMUL_F32X2;
- break;
- case NVPTXISD::FMA_F32X2:
- Opcode = NVPTX::FMA_F32X2;
- break;
- default:
- llvm_unreachable("Unexpected opcode!");
- }
- SDLoc DL(N);
- SmallVector<SDValue> NewOps(N->ops());
- SDNode *NewNode = CurDAG->getMachineNode(Opcode, DL, MVT::i64, NewOps);
- ReplaceNode(N, NewNode);
-}
-
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 6792ccc2b168bd3..62e81d250d2f734 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -88,7 +88,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
- void SelectF32X2Op(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2402265368f4aed..0cf2b024fbba33d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1581,27 +1581,50 @@ def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
-// F32x2 ops (sm_100+)
-
-def FADD_F32X2 : NVPTXInst<(outs Int64Regs:$res),
- (ins Int64Regs:$a, Int64Regs:$b),
- "add.rn.f32x2 \t$res, $a, $b;", []>,
- Requires<[hasF32x2Instructions]>;
-
-def FSUB_F32X2 : NVPTXInst<(outs Int64Regs:$res),
- (ins Int64Regs:$a, Int64Regs:$b),
- "sub.rn.f32x2 \t$res, $a, $b;", []>,
- Requires<[hasF32x2Instructions]>;
-
-def FMUL_F32X2 : NVPTXInst<(outs Int64Regs:$res),
- (ins Int64Regs:$a, Int64Regs:$b),
- "mul.rn.f32x2 \t$res, $a, $b;", []>,
- Requires<[hasF32x2Instructions]>;
-
-def FMA_F32X2 : NVPTXInst<(outs Int64Regs:$res),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "fma.rn.f32x2 \t$res, $a, $b;", []>,
- Requires<[hasF32x2Instructions]>;
+// packed f32 ops (sm_100+)
+class F32x2Op2<string OpcStr, Predicate Pred>
+: NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ OpcStr # ".f32x2 \t$res, $a, $b;", []>,
+ Requires<[hasF32x2Instructions, Pred]>;
+class F32x2Op3<string OpcStr, Predicate Pred>
+: NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ OpcStr # ".f32x2 \t$res, $a, $b, $c;", []>,
+ Requires<[hasF32x2Instructions, Pred]>;
+
+def fadd32x2_nvptx : SDNode<"NVPTXISD::FADD_F32X2", SDTIntBinOp>;
+def fsub32x2_nvptx : SDNode<"NVPTXISD::FSUB_F32X2", SDTIntBinOp>;
+def fmul32x2_nvptx : SDNode<"NVPTXISD::FMUL_F32X2", SDTIntBinOp>;
+def fma32x2_nvptx : SDNode<"NVPTXISD::FMA_F32X2", SDTIntTernaryOp>;
+
+def FADD32x2 : F32x2Op2<"add.rn", doNoF32FTZ>;
+def FSUB32x2 : F32x2Op2<"sub.rn", doNoF32FTZ>;
+def FMUL32x2 : F32x2Op2<"mul.rn", doNoF32FTZ>;
+def FMA32x2 : F32x2Op3<"fma.rn", doNoF32FTZ>;
+
+def : Pat<(fadd32x2_nvptx i64:$a, i64:$b),
+ (FADD32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fsub32x2_nvptx i64:$a, i64:$b),
+ (FSUB32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fmul32x2_nvptx i64:$a, i64:$b),
+ (FMUL32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fma32x2_nvptx i64:$a, i64:$b, i64:$c),
+ (FMA32x2 $a, $b, $c)>, Requires<[doNoF32FTZ]>;
+
+def FADD32x2_ftz : F32x2Op2<"add.rn.ftz", doF32FTZ>;
+def FSUB32x2_ftz : F32x2Op2<"sub.rn.ftz", doF32FTZ>;
+def FMUL32x2_ftz : F32x2Op2<"mul.rn.ftz", doF32FTZ>;
+def FMA32x2_ftz : F32x2Op3<"fma.rn.ftz", doF32FTZ>;
+
+def : Pat<(fadd32x2_nvptx i64:$a, i64:$b),
+ (FADD32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
+def : Pat<(fsub32x2_nvptx i64:$a, i64:$b),
+ (FSUB32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
+def : Pat<(fmul32x2_nvptx i64:$a, i64:$b),
+ (FMUL32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
+def : Pat<(fma32x2_nvptx i64:$a, i64:$b, i64:$c),
+ (FMA32x2_ftz $a, $b, $c)>, Requires<[doF32FTZ]>;
//
// BFIND
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 4ec77b852744726..2c28c4121405424 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -8,14 +8,6 @@
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 \
; RUN: %}
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
-; RUN: -O3 -verify-machineinstrs \
-; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
-; RUN: %if ptxas %{ \
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
-; RUN: -O3 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_100 \
-; RUN: %}
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "nvptx64-nvidia-cuda"
@@ -80,23 +72,19 @@ define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fadd(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
-; CHECK-NEXT: mov.b32 %r3, %f4;
-; CHECK-NEXT: mov.b32 %r4, %f3;
-; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
-; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r6;
-; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, %b
@@ -106,20 +94,18 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_0(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
-; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
@@ -129,20 +115,18 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_1(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
-; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
@@ -152,33 +136,25 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: test_fadd_v4(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .f32 %f<13>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f4;
-; CHECK-NEXT: mov.b32 %r2, %f3;
-; CHECK-NEXT: mov.b32 %r3, %f2;
-; CHECK-NEXT: mov.b32 %r4, %f1;
; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
-; CHECK-NEXT: mov.b32 %r5, %f6;
-; CHECK-NEXT: mov.b32 %r6, %f5;
-; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5};
-; CHECK-NEXT: mov.b64 %rd4, {%r4, %r3};
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
+; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
+; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
; CHECK-NEXT: add.rn.f32x2 %rd2, %rd4, %rd3;
-; CHECK-NEXT: mov.b32 %r7, %f8;
-; CHECK-NEXT: mov.b32 %r8, %f7;
-; CHECK-NEXT: mov.b64 %rd5, {%r8, %r7};
-; CHECK-NEXT: mov.b64 %rd6, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd5, {%f7, %f8};
+; CHECK-NEXT: mov.b64 %rd6, {%f3, %f4};
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
-; CHECK-NEXT: mov.b64 {%r9, %r10}, %rd2;
-; CHECK-NEXT: mov.b64 {%r11, %r12}, %rd1;
-; CHECK-NEXT: mov.b32 %f9, %r12;
-; CHECK-NEXT: mov.b32 %f10, %r11;
-; CHECK-NEXT: mov.b32 %f11, %r10;
-; CHECK-NEXT: mov.b32 %f12, %r9;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f9, %r4;
+; CHECK-NEXT: mov.b32 %f10, %r3;
+; CHECK-NEXT: mov.b32 %f11, %r2;
+; CHECK-NEXT: mov.b32 %f12, %r1;
; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
@@ -188,28 +164,24 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_0_v4(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f4;
-; CHECK-NEXT: mov.b32 %r2, %f3;
-; CHECK-NEXT: mov.b32 %r3, %f2;
-; CHECK-NEXT: mov.b32 %r4, %f1;
-; CHECK-NEXT: mov.b64 %rd3, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r8;
-; CHECK-NEXT: mov.b32 %f6, %r7;
-; CHECK-NEXT: mov.b32 %f7, %r6;
-; CHECK-NEXT: mov.b32 %f8, %r5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r4;
+; CHECK-NEXT: mov.b32 %f6, %r3;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
@@ -219,28 +191,24 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_1_v4(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f4;
-; CHECK-NEXT: mov.b32 %r2, %f3;
-; CHECK-NEXT: mov.b32 %r3, %f2;
-; CHECK-NEXT: mov.b32 %r4, %f1;
-; CHECK-NEXT: mov.b64 %rd3, {%r4, %r3};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r8;
-; CHECK-NEXT: mov.b32 %f6, %r7;
-; CHECK-NEXT: mov.b32 %f7, %r6;
-; CHECK-NEXT: mov.b32 %f8, %r5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r4;
+; CHECK-NEXT: mov.b32 %f6, %r3;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -250,23 +218,19 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fsub(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
-; CHECK-NEXT: mov.b32 %r3, %f4;
-; CHECK-NEXT: mov.b32 %r4, %f3;
-; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
-; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r6;
-; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fsub <2 x float> %a, %b
@@ -276,20 +240,18 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
define <2 x float> @test_fneg(<2 x float> %a) #0 {
; CHECK-LABEL: test_fneg(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
-; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1};
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-NEXT: mov.b64 %rd3, 0;
; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r4;
-; CHECK-NEXT: mov.b32 %f4, %r3;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
; CHECK-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
@@ -299,29 +261,49 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fmul(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
-; CHECK-NEXT: mov.b32 %r3, %f4;
-; CHECK-NEXT: mov.b32 %r4, %f3;
-; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
-; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-NEXT: mul.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r6;
-; CHECK-NEXT: mov.b32 %f6, %r5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
+define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_fma(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f5, %f6};
+; CHECK-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-NEXT: fma.rn.f32x2 %rd1, %rd4, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-NEXT: ret;
+ %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <2 x float> %r
+}
+
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fdiv(
; CHECK: {
@@ -365,6 +347,284 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
ret <2 x float> %r
}
+define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-LABEL: test_fadd_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
+; CHECK-LABEL: test_fadd_imm_0_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> <float 1.0, float 2.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
+; CHECK-LABEL: test_fadd_imm_1_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %r = fadd <2 x float> %a, <float 1.0, float 2.0>
+ ret <2 x float> %r
+}
+
+define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
+; CHECK-LABEL: test_fadd_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<13>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
+; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd3;
+; CHECK-NEXT: mov.b64 %rd5, {%f7, %f8};
+; CHECK-NEXT: mov.b64 %rd6, {%f3, %f4};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd5;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f9, %r4;
+; CHECK-NEXT: mov.b32 %f10, %r3;
+; CHECK-NEXT: mov.b32 %f11, %r2;
+; CHECK-NEXT: mov.b32 %f12, %r1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> %a, %b
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
+; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd4;
+; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r4;
+; CHECK-NEXT: mov.b32 %f6, %r3;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
+; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd4;
+; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r4;
+; CHECK-NEXT: mov.b32 %f6, %r3;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
+; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %r
+}
+
+define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-LABEL: test_fsub_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fsub <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
+; CHECK-LABEL: test_fneg_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-NEXT: mov.b64 %rd3, 0;
+; CHECK-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f3, %r2;
+; CHECK-NEXT: mov.b32 %f4, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %r = fsub <2 x float> <float 0.0, float 0.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-LABEL: test_fmul_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-NEXT: mul.rn.ftz.f32x2 %rd1, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r2;
+; CHECK-NEXT: mov.b32 %f6, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fmul <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 {
+; CHECK-LABEL: test_fma_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_ftz_param_0];
+; CHECK-NEXT: mov.b64 %rd2, {%f5, %f6};
+; CHECK-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-NEXT: fma.rn.ftz.f32x2 %rd1, %rd4, %rd3, %rd2;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: mov.b32 %f7, %r2;
+; CHECK-NEXT: mov.b32 %f8, %r1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-NEXT: ret;
+ %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-LABEL: test_fdiv_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_ftz_param_0];
+; CHECK-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-NEXT: div.rn.ftz.f32 %f6, %f1, %f3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-NEXT: ret;
+ %r = fdiv <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-LABEL: test_frem_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_ftz_param_0];
+; CHECK-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-NEXT: mul.ftz.f32 %f7, %f6, %f4;
+; CHECK-NEXT: sub.ftz.f32 %f8, %f2, %f7;
+; CHECK-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-NEXT: div.rn.ftz.f32 %f10, %f1, %f3;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-NEXT: mul.ftz.f32 %f12, %f11, %f3;
+; CHECK-NEXT: sub.ftz.f32 %f13, %f1, %f12;
+; CHECK-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-NEXT: selp.f32 %f14, %f1, %f13, %p2;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-NEXT: ret;
+ %r = frem <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: test_ldst_v2f32(
; CHECK: {
@@ -1037,7 +1297,7 @@ define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_uitofp_2xi32_fadd(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .f32 %f<7>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
@@ -1052,13 +1312,11 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-NEXT: cvt.u64.u32 %rd3, %r4;
; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
; CHECK-NEXT: or.b64 %rd5, %rd2, %rd4;
-; CHECK-NEXT: mov.b32 %r5, %f2;
-; CHECK-NEXT: mov.b32 %r6, %f1;
-; CHECK-NEXT: mov.b64 %rd6, {%r6, %r5};
+; CHECK-NEXT: mov.b64 %rd6, {%f1, %f2};
; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
-; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r8;
-; CHECK-NEXT: mov.b32 %f6, %r7;
+; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NEXT: mov.b32 %f5, %r6;
+; CHECK-NEXT: mov.b32 %f6, %r5;
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
@@ -1101,17 +1359,16 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<3>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_2xi32_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f2;
-; CHECK-NEXT: mov.b32 %r2, %f1;
-; CHECK-NEXT: mov.b64 %rd1, {%r2, %r1};
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-NEXT: mov.b32 %r1, %f1;
+; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
; CHECK-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
@@ -1179,3 +1436,4 @@ define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
attributes #0 = { nounwind }
attributes #1 = { "unsafe-fp-math" = "true" }
+attributes #2 = { "denormal-fp-math"="preserve-sign" }
>From a83c6ece2b34f1b63e75c206313a6f92270f5c06 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Feb 2025 14:23:59 -0800
Subject: [PATCH 15/22] remove unnecessary TLI override
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 7 -------
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 ---
2 files changed, 10 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 46c302358dfc4bc..331867a497b7272 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3137,13 +3137,6 @@ bool NVPTXTargetLowering::splitValueIntoRegisterParts(
return false;
}
-const TargetRegisterClass *
-NVPTXTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
- if (VT == MVT::v2f32)
- return &NVPTX::Int64RegsRegClass;
- return TargetLowering::getRegClassFor(VT, isDivergent);
-}
-
// This creates target external symbol for a function parameter.
// Name of the symbol is composed from its index and the function name.
// Negative index corresponds to special parameter (unsized array) used for
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index f41902fbcaf99a9..8fd4ded42a238a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -315,9 +315,6 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue *Parts, unsigned NumParts, MVT PartVT,
std::optional<CallingConv::ID> CC) const override;
- const TargetRegisterClass *getRegClassFor(MVT VT,
- bool isDivergent) const override;
-
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
>From 9457c92221a51bf79a433a196ae71edc4627eb24 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Feb 2025 17:30:07 -0800
Subject: [PATCH 16/22] don't custom lower i64 = bitcast v2f32
Custom lowering is unnecessary here. DAGCombiner already has rules for
shift patterns, and we don't need optimal code at -O0. Finally,
introducing build_pair at this stage defeats an existing peephole
optimization in DAGCombiner.
We also update the test case with -O3 compilation.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 70 +-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 3537 ++++++++++++-----
2 files changed, 2476 insertions(+), 1131 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 331867a497b7272..d5c3a92a395941e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -870,8 +870,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Handle custom lowering for: v2f32 = OP v2f32, v2f32
for (const auto &Op : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA})
setOperationAction(Op, MVT::v2f32, Custom);
- // Handle custom lowering for: i64 = bitcast v2f32
- setOperationAction(ISD::BITCAST, MVT::v2f32, Custom);
// Handle custom lowering for: f32 = extract_vector_elt v2f32
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
// Combine:
@@ -2123,58 +2121,24 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
EVT FromVT = Op->getOperand(0)->getValueType(0);
- EVT ToVT = Op->getValueType(0);
- SDLoc DL(Op);
-
- if (FromVT == MVT::v2i8) {
- // Pack vector elements into i16 and bitcast to final type
- SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
- SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
- SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
- SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
- SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
- SDValue AsInt = DAG.getNode(
- ISD::OR, DL, MVT::i16,
- {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
- EVT ToVT = Op->getValueType(0);
- return MaybeBitcast(DAG, DL, ToVT, AsInt);
- }
-
- if (FromVT == MVT::v2f32) {
- // A bitcast to i64 from v2f32.
- // See if we can legalize the operand.
- const SDValue &Operand = Op->getOperand(0);
- if (ToVT == MVT::i64 && Operand.getOpcode() == ISD::BUILD_VECTOR) {
- const SDValue &BVOp0 = Operand.getOperand(0);
- const SDValue &BVOp1 = Operand.getOperand(1);
-
- auto CastToAPInt = [](SDValue Op) -> APInt {
- if (Op->isUndef())
- return APInt(64, 0); // undef values default to 0
- return cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt().zext(
- 64);
- };
-
- if ((BVOp0->isUndef() || isa<ConstantFPSDNode>(BVOp0)) &&
- (BVOp1->isUndef() || isa<ConstantFPSDNode>(BVOp1))) {
- // cast two constants
- APInt Value(64, 0);
- Value = CastToAPInt(BVOp0) | CastToAPInt(BVOp1).shl(32);
- return DAG.getConstant(Value, DL, MVT::i64);
- }
-
- // otherwise build an i64
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
- DAG.getBitcast(MVT::i32, BVOp0),
- DAG.getBitcast(MVT::i32, BVOp1));
- }
-
- // Otherwise, let SelectionDAG expand the operand
- return SDValue();
+ if (FromVT != MVT::v2i8) {
+ return Op;
}
- return Op;
+
+ // Pack vector elements into i16 and bitcast to final type
+ SDLoc DL(Op);
+ SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
+ SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
+ SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
+ SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
+ SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
+ SDValue AsInt = DAG.getNode(
+ ISD::OR, DL, MVT::i16,
+ {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
+ EVT ToVT = Op->getValueType(0);
+ return MaybeBitcast(DAG, DL, ToVT, AsInt);
}
// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 2c28c4121405424..148c29517780597 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -2,696 +2,1478 @@
; ## Full FP32x2 support enabled by default.
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | FileCheck --check-prefixes=CHECK %s
+; RUN: | FileCheck --check-prefixes=CHECK-O0 %s
; RUN: %if ptxas %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 \
; RUN: %}
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 \
+; RUN: %}
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "nvptx64-nvidia-cuda"
define <2 x float> @test_ret_const() #0 {
-; CHECK-LABEL: test_ret_const(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_ret_const(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ret_const(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
ret <2 x float> <float 1.0, float 2.0>
}
define float @test_extract_0(<2 x float> %a) #0 {
-; CHECK-LABEL: test_extract_0(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
-; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_extract_0(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
+; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_extract_0(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
+; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O3-NEXT: ret;
%e = extractelement <2 x float> %a, i32 0
ret float %e
}
define float @test_extract_1(<2 x float> %a) #0 {
-; CHECK-LABEL: test_extract_1(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
-; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_extract_1(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
+; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_extract_1(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
+; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-O3-NEXT: ret;
%e = extractelement <2 x float> %a, i32 1
ret float %e
}
-define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
-; CHECK-LABEL: test_extract_i(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .f32 %f<4>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
-; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
-; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
-; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
-; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
-; CHECK-NEXT: ret;
- %e = extractelement <2 x float> %a, i64 %idx
- ret float %e
-}
+; NOTE: disabled because -O3 miscompiles this into pointer arithmetic on
+; test_extract_i_param_0 without first taking the symbol's address (that
+; is, without first moving it to a temporary)
+; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; ; CHECK-LABEL: test_extract_i(
+; ; CHECK: {
+; ; CHECK-NEXT: .reg .pred %p<2>;
+; ; CHECK-NEXT: .reg .f32 %f<4>;
+; ; CHECK-NEXT: .reg .b64 %rd<2>;
+; ; CHECK-EMPTY:
+; ; CHECK-NEXT: // %bb.0:
+; ; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
+; ; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
+; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
+; ; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
+; ; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
+; ; CHECK-NEXT: ret;
+; %e = extractelement <2 x float> %a, i64 %idx
+; ret float %e
+; }
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fadd(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_0(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_1(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_fadd_v4(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<13>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
-; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-NEXT: add.rn.f32x2 %rd2, %rd4, %rd3;
-; CHECK-NEXT: mov.b64 %rd5, {%f7, %f8};
-; CHECK-NEXT: mov.b64 %rd6, {%f3, %f4};
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f9, %r4;
-; CHECK-NEXT: mov.b32 %f10, %r3;
-; CHECK-NEXT: mov.b32 %f11, %r2;
-; CHECK-NEXT: mov.b32 %f12, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<13>;
+; CHECK-O0-NEXT: .reg .f32 %f<13>;
+; CHECK-O0-NEXT: .reg .b64 %rd<19>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
+; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r5;
+; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd12, %r6;
+; CHECK-O0-NEXT: shl.b64 %rd13, %rd12, 32;
+; CHECK-O0-NEXT: or.b64 %rd14, %rd11, %rd13;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd14;
+; CHECK-O0-NEXT: mov.b32 %r7, %f7;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd15, %r7;
+; CHECK-O0-NEXT: mov.b32 %r8, %f8;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd16, %r8;
+; CHECK-O0-NEXT: shl.b64 %rd17, %rd16, 32;
+; CHECK-O0-NEXT: or.b64 %rd18, %rd15, %rd17;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd18;
+; CHECK-O0-NEXT: mov.b64 {%r9, %r10}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r11, %r12}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f9, %r12;
+; CHECK-O0-NEXT: mov.b32 %f10, %r11;
+; CHECK-O0-NEXT: mov.b32 %f11, %r10;
+; CHECK-O0-NEXT: mov.b32 %f12, %r9;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<13>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, {%f7, %f8};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f9, %r4;
+; CHECK-O3-NEXT: mov.b32 %f10, %r3;
+; CHECK-O3-NEXT: mov.b32 %f11, %r2;
+; CHECK-O3-NEXT: mov.b32 %f12, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
-; CHECK-LABEL: test_fadd_imm_0_v4(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
-; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r4;
-; CHECK-NEXT: mov.b32 %f6, %r3;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_0_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<13>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r8;
+; CHECK-O0-NEXT: mov.b32 %f6, %r7;
+; CHECK-O0-NEXT: mov.b32 %f7, %r6;
+; CHECK-O0-NEXT: mov.b32 %f8, %r5;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r4;
+; CHECK-O3-NEXT: mov.b32 %f6, %r3;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
-; CHECK-LABEL: test_fadd_imm_1_v4(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
-; CHECK-NEXT: add.rn.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r4;
-; CHECK-NEXT: mov.b32 %f6, %r3;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_1_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<13>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r8;
+; CHECK-O0-NEXT: mov.b32 %f6, %r7;
+; CHECK-O0-NEXT: mov.b32 %f7, %r6;
+; CHECK-O0-NEXT: mov.b32 %f8, %r5;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r4;
+; CHECK-O3-NEXT: mov.b32 %f6, %r3;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fsub(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fsub(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: sub.rn.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fsub(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fneg(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 0;
-; CHECK-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fneg(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 0;
+; CHECK-O0-NEXT: sub.rn.f32x2 %rd1, %rd6, %rd5;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fneg(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 0;
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fmul(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mul.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fmul(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: mul.rn.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fmul(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mul.rn.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
-; CHECK-LABEL: test_fma(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f5, %f6};
-; CHECK-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-NEXT: fma.rn.f32x2 %rd1, %rd4, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fma(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd10, %r5;
+; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r6;
+; CHECK-O0-NEXT: shl.b64 %rd12, %rd11, 32;
+; CHECK-O0-NEXT: or.b64 %rd13, %rd10, %rd12;
+; CHECK-O0-NEXT: fma.rn.f32x2 %rd1, %rd5, %rd9, %rd13;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f7, %r8;
+; CHECK-O0-NEXT: mov.b32 %f8, %r7;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fma(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
+; CHECK-O3-NEXT: fma.rn.f32x2 %rd1, %rd2, %rd3, %rd4;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fdiv(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_param_0];
-; CHECK-NEXT: div.rn.f32 %f5, %f2, %f4;
-; CHECK-NEXT: div.rn.f32 %f6, %f1, %f3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fdiv(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_param_0];
+; CHECK-O0-NEXT: div.rn.f32 %f5, %f2, %f4;
+; CHECK-O0-NEXT: div.rn.f32 %f6, %f1, %f3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fdiv(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_param_1];
+; CHECK-O3-NEXT: div.rn.f32 %f5, %f2, %f4;
+; CHECK-O3-NEXT: div.rn.f32 %f6, %f1, %f3;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_frem(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .f32 %f<15>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_param_0];
-; CHECK-NEXT: div.rn.f32 %f5, %f2, %f4;
-; CHECK-NEXT: cvt.rzi.f32.f32 %f6, %f5;
-; CHECK-NEXT: mul.f32 %f7, %f6, %f4;
-; CHECK-NEXT: sub.f32 %f8, %f2, %f7;
-; CHECK-NEXT: testp.infinite.f32 %p1, %f4;
-; CHECK-NEXT: selp.f32 %f9, %f2, %f8, %p1;
-; CHECK-NEXT: div.rn.f32 %f10, %f1, %f3;
-; CHECK-NEXT: cvt.rzi.f32.f32 %f11, %f10;
-; CHECK-NEXT: mul.f32 %f12, %f11, %f3;
-; CHECK-NEXT: sub.f32 %f13, %f1, %f12;
-; CHECK-NEXT: testp.infinite.f32 %p2, %f3;
-; CHECK-NEXT: selp.f32 %f14, %f1, %f13, %p2;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_frem(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<15>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_param_0];
+; CHECK-O0-NEXT: div.rn.f32 %f5, %f2, %f4;
+; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-O0-NEXT: mul.f32 %f7, %f6, %f4;
+; CHECK-O0-NEXT: sub.f32 %f8, %f2, %f7;
+; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-O0-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-O0-NEXT: div.rn.f32 %f10, %f1, %f3;
+; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-O0-NEXT: mul.f32 %f12, %f11, %f3;
+; CHECK-O0-NEXT: sub.f32 %f13, %f1, %f12;
+; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-O0-NEXT: selp.f32 %f14, %f1, %f13, %p2;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_frem(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<15>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_param_1];
+; CHECK-O3-NEXT: div.rn.f32 %f5, %f2, %f4;
+; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-O3-NEXT: mul.f32 %f7, %f6, %f4;
+; CHECK-O3-NEXT: sub.f32 %f8, %f2, %f7;
+; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-O3-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-O3-NEXT: div.rn.f32 %f10, %f1, %f3;
+; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-O3-NEXT: mul.f32 %f12, %f11, %f3;
+; CHECK-O3-NEXT: sub.f32 %f13, %f1, %f12;
+; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-O3-NEXT: selp.f32 %f14, %f1, %f13, %p2;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-O3-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fadd_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
-; CHECK-LABEL: test_fadd_imm_0_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_0_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
-; CHECK-LABEL: test_fadd_imm_1_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_1_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
-; CHECK-LABEL: test_fadd_v4_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<13>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f5, %f6};
-; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd3;
-; CHECK-NEXT: mov.b64 %rd5, {%f7, %f8};
-; CHECK-NEXT: mov.b64 %rd6, {%f3, %f4};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd5;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f9, %r4;
-; CHECK-NEXT: mov.b32 %f10, %r3;
-; CHECK-NEXT: mov.b32 %f11, %r2;
-; CHECK-NEXT: mov.b32 %f12, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<13>;
+; CHECK-O0-NEXT: .reg .f32 %f<13>;
+; CHECK-O0-NEXT: .reg .b64 %rd<19>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
+; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r5;
+; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd12, %r6;
+; CHECK-O0-NEXT: shl.b64 %rd13, %rd12, 32;
+; CHECK-O0-NEXT: or.b64 %rd14, %rd11, %rd13;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd14;
+; CHECK-O0-NEXT: mov.b32 %r7, %f7;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd15, %r7;
+; CHECK-O0-NEXT: mov.b32 %r8, %f8;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd16, %r8;
+; CHECK-O0-NEXT: shl.b64 %rd17, %rd16, 32;
+; CHECK-O0-NEXT: or.b64 %rd18, %rd15, %rd17;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd18;
+; CHECK-O0-NEXT: mov.b64 {%r9, %r10}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r11, %r12}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f9, %r12;
+; CHECK-O0-NEXT: mov.b32 %f10, %r11;
+; CHECK-O0-NEXT: mov.b32 %f11, %r10;
+; CHECK-O0-NEXT: mov.b32 %f12, %r9;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<13>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, {%f7, %f8};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f9, %r4;
+; CHECK-O3-NEXT: mov.b32 %f10, %r3;
+; CHECK-O3-NEXT: mov.b32 %f11, %r2;
+; CHECK-O3-NEXT: mov.b32 %f12, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
-; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r4;
-; CHECK-NEXT: mov.b32 %f6, %r3;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<13>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r8;
+; CHECK-O0-NEXT: mov.b32 %f6, %r7;
+; CHECK-O0-NEXT: mov.b32 %f7, %r6;
+; CHECK-O0-NEXT: mov.b32 %f8, %r5;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r4;
+; CHECK-O3-NEXT: mov.b32 %f6, %r3;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
-; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd4, 4611686019492741120;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd4;
-; CHECK-NEXT: mov.b64 %rd5, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r4;
-; CHECK-NEXT: mov.b32 %f6, %r3;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<13>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
+; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
+; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
+; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r8;
+; CHECK-O0-NEXT: mov.b32 %f6, %r7;
+; CHECK-O0-NEXT: mov.b32 %f7, %r6;
+; CHECK-O0-NEXT: mov.b32 %f8, %r5;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r4;
+; CHECK-O3-NEXT: mov.b32 %f6, %r3;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fsub_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fsub_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fsub_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
-; CHECK-LABEL: test_fneg_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f1, %f2};
-; CHECK-NEXT: mov.b64 %rd3, 0;
-; CHECK-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f3, %r2;
-; CHECK-NEXT: mov.b32 %f4, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fneg_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: mov.b64 %rd6, 0;
+; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd1, %rd6, %rd5;
+; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f3, %r4;
+; CHECK-O0-NEXT: mov.b32 %f4, %r3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fneg_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, 0;
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f3, %r2;
+; CHECK-O3-NEXT: mov.b32 %f4, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fmul_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd3, {%f1, %f2};
-; CHECK-NEXT: mul.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r2;
-; CHECK-NEXT: mov.b32 %f6, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fmul_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: mul.rn.ftz.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r6;
+; CHECK-O0-NEXT: mov.b32 %f6, %r5;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fmul_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd1, %rd2, %rd3;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r2;
+; CHECK-O3-NEXT: mov.b32 %f6, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 {
-; CHECK-LABEL: test_fma_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_ftz_param_0];
-; CHECK-NEXT: mov.b64 %rd2, {%f5, %f6};
-; CHECK-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-NEXT: fma.rn.ftz.f32x2 %rd1, %rd4, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-NEXT: mov.b32 %f7, %r2;
-; CHECK-NEXT: mov.b32 %f8, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fma_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_ftz_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_ftz_param_1];
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd10, %r5;
+; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r6;
+; CHECK-O0-NEXT: shl.b64 %rd12, %rd11, 32;
+; CHECK-O0-NEXT: or.b64 %rd13, %rd10, %rd12;
+; CHECK-O0-NEXT: fma.rn.ftz.f32x2 %rd1, %rd5, %rd9, %rd13;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f7, %r8;
+; CHECK-O0-NEXT: mov.b32 %f8, %r7;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fma_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_ftz_param_0];
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fma_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
+; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd1, %rd2, %rd3, %rd4;
+; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f7, %r2;
+; CHECK-O3-NEXT: mov.b32 %f8, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_fdiv_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_ftz_param_0];
-; CHECK-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
-; CHECK-NEXT: div.rn.ftz.f32 %f6, %f1, %f3;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fdiv_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_ftz_param_0];
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f6, %f1, %f3;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fdiv_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fdiv_ftz_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fdiv_ftz_param_1];
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f6, %f1, %f3;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-LABEL: test_frem_ftz(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .f32 %f<15>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_ftz_param_0];
-; CHECK-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
-; CHECK-NEXT: mul.ftz.f32 %f7, %f6, %f4;
-; CHECK-NEXT: sub.ftz.f32 %f8, %f2, %f7;
-; CHECK-NEXT: testp.infinite.f32 %p1, %f4;
-; CHECK-NEXT: selp.f32 %f9, %f2, %f8, %p1;
-; CHECK-NEXT: div.rn.ftz.f32 %f10, %f1, %f3;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
-; CHECK-NEXT: mul.ftz.f32 %f12, %f11, %f3;
-; CHECK-NEXT: sub.ftz.f32 %f13, %f1, %f12;
-; CHECK-NEXT: testp.infinite.f32 %p2, %f3;
-; CHECK-NEXT: selp.f32 %f14, %f1, %f13, %p2;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_frem_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<15>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_ftz_param_0];
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-O0-NEXT: mul.ftz.f32 %f7, %f6, %f4;
+; CHECK-O0-NEXT: sub.ftz.f32 %f8, %f2, %f7;
+; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-O0-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f10, %f1, %f3;
+; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-O0-NEXT: mul.ftz.f32 %f12, %f11, %f3;
+; CHECK-O0-NEXT: sub.ftz.f32 %f13, %f1, %f12;
+; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-O0-NEXT: selp.f32 %f14, %f1, %f13, %p2;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_frem_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<15>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_frem_ftz_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_frem_ftz_param_1];
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f2, %f4;
+; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-O3-NEXT: mul.ftz.f32 %f7, %f6, %f4;
+; CHECK-O3-NEXT: sub.ftz.f32 %f8, %f2, %f7;
+; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f4;
+; CHECK-O3-NEXT: selp.f32 %f9, %f2, %f8, %p1;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f10, %f1, %f3;
+; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-O3-NEXT: mul.ftz.f32 %f12, %f11, %f3;
+; CHECK-O3-NEXT: sub.ftz.f32 %f13, %f1, %f12;
+; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f3;
+; CHECK-O3-NEXT: selp.f32 %f14, %f1, %f13, %p2;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
+; CHECK-O3-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v2f32(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_ldst_v2f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-O0-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-O0-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v2f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-O3-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-O3-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-O3-NEXT: ret;
%t1 = load <2 x float>, ptr %a
store <2 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v3f32(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<2>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
-; CHECK-NEXT: ld.u64 %rd3, [%rd1];
-; CHECK-NEXT: ld.f32 %f1, [%rd1+8];
-; CHECK-NEXT: st.f32 [%rd2+8], %f1;
-; CHECK-NEXT: st.u64 [%rd2], %rd3;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_ldst_v3f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-O0-NEXT: ld.u64 %rd3, [%rd1];
+; CHECK-O0-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-O0-NEXT: st.f32 [%rd2+8], %f1;
+; CHECK-O0-NEXT: st.u64 [%rd2], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v3f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-O3-NEXT: ld.u64 %rd2, [%rd1];
+; CHECK-O3-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-O3-NEXT: ld.param.u64 %rd3, [test_ldst_v3f32_param_1];
+; CHECK-O3-NEXT: st.f32 [%rd3+8], %f1;
+; CHECK-O3-NEXT: st.u64 [%rd3], %rd2;
+; CHECK-O3-NEXT: ret;
%t1 = load <3 x float>, ptr %a
store <3 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v4f32(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
-; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_ldst_v4f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v4f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O3-NEXT: ret;
%t1 = load <4 x float>, ptr %a
store <4 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: test_ldst_v8f32(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
-; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
-; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_ldst_v8f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O0-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-O0-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v8f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O3-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-O3-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O3-NEXT: ret;
%t1 = load <8 x float>, ptr %a
store <8 x float> %t1, ptr %b, align 32
ret void
@@ -700,736 +1482,1335 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_param_0];
-; CHECK-NEXT: { // callseq 0, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK-NEXT: (
-; CHECK-NEXT: param0,
-; CHECK-NEXT: param1
-; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
-; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_call(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_param_0];
+; CHECK-O0-NEXT: { // callseq 0, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O0-NEXT: } // callseq 0
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_call(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_param_1];
+; CHECK-O3-NEXT: { // callseq 0, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param1], {%f3, %f4};
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O3-NEXT: } // callseq 0
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_call_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_flipped_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_flipped_param_0];
-; CHECK-NEXT: { // callseq 1, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK-NEXT: (
-; CHECK-NEXT: param0,
-; CHECK-NEXT: param1
-; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
-; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_call_flipped(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_flipped_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_flipped_param_0];
+; CHECK-O0-NEXT: { // callseq 1, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O0-NEXT: } // callseq 1
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_call_flipped(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_call_flipped_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_call_flipped_param_1];
+; CHECK-O3-NEXT: { // callseq 1, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O3-NEXT: } // callseq 1
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_tailcall_flipped_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_tailcall_flipped_param_0];
-; CHECK-NEXT: { // callseq 2, 0
-; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
-; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
-; CHECK-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-NEXT: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK-NEXT: (
-; CHECK-NEXT: param0,
-; CHECK-NEXT: param1
-; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
-; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_tailcall_flipped(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_tailcall_flipped_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_tailcall_flipped_param_0];
+; CHECK-O0-NEXT: { // callseq 2, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O0-NEXT: } // callseq 2
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_tailcall_flipped(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_tailcall_flipped_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_tailcall_flipped_param_1];
+; CHECK-O3-NEXT: { // callseq 2, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param0], {%f3, %f4};
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.v2.f32 [param1], {%f1, %f2};
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [retval0];
+; CHECK-O3-NEXT: } // callseq 2
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: ret;
%r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
-; CHECK-LABEL: test_select(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
-; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_param_0];
-; CHECK-NEXT: selp.f32 %f5, %f2, %f4, %p1;
-; CHECK-NEXT: selp.f32 %f6, %f1, %f3, %p1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_select(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<2>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-O0-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-O0-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_param_0];
+; CHECK-O0-NEXT: selp.f32 %f5, %f2, %f4, %p1;
+; CHECK-O0-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<2>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-O3-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-O3-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_param_1];
+; CHECK-O3-NEXT: selp.f32 %f5, %f2, %f4, %p1;
+; CHECK-O3-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%r = select i1 %c, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-LABEL: test_select_cc(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .f32 %f<11>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_param_3];
-; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_select_cc_param_2];
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_param_0];
-; CHECK-NEXT: setp.neu.f32 %p1, %f5, %f7;
-; CHECK-NEXT: setp.neu.f32 %p2, %f6, %f8;
-; CHECK-NEXT: selp.f32 %f9, %f2, %f4, %p2;
-; CHECK-NEXT: selp.f32 %f10, %f1, %f3, %p1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_select_cc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<11>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_param_3];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_select_cc_param_2];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_param_0];
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f5, %f7;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f6, %f8;
+; CHECK-O0-NEXT: selp.f32 %f9, %f2, %f4, %p2;
+; CHECK-O0-NEXT: selp.f32 %f10, %f1, %f3, %p1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<11>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_param_2];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_select_cc_param_3];
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f3, %f5;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f4, %f6;
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_param_1];
+; CHECK-O3-NEXT: selp.f32 %f9, %f2, %f8, %p2;
+; CHECK-O3-NEXT: selp.f32 %f10, %f1, %f7, %p1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
+; CHECK-O3-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-LABEL: test_select_cc_f64_f32(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .f64 %fd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f64_f32_param_3];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f64_f32_param_2];
-; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
-; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
-; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3;
-; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4;
-; CHECK-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
-; CHECK-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
-; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_select_cc_f64_f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .f64 %fd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f64_f32_param_3];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f64_f32_param_2];
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f1, %f3;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f2, %f4;
+; CHECK-O0-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-O0-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc_f64_f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .f64 %fd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f64_f32_param_2];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f64_f32_param_3];
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f1, %f3;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f2, %f4;
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-O3-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-O3-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-O3-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
ret <2 x double> %r
}
define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
-; CHECK-LABEL: test_select_cc_f32_f64(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
-; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f64_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f64_param_0];
-; CHECK-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
-; CHECK-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
-; CHECK-NEXT: selp.f32 %f5, %f2, %f4, %p2;
-; CHECK-NEXT: selp.f32 %f6, %f1, %f3, %p1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_select_cc_f32_f64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .f64 %fd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f64_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f64_param_0];
+; CHECK-O0-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-O0-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-O0-NEXT: selp.f32 %f5, %f2, %f4, %p2;
+; CHECK-O0-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc_f32_f64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .f64 %fd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f64_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-O3-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-O3-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f64_param_1];
+; CHECK-O3-NEXT: selp.f32 %f5, %f2, %f4, %p2;
+; CHECK-O3-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%cc = fcmp une <2 x double> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_une(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_une_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_une_param_0];
-; CHECK-NEXT: setp.neu.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.neu.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_une(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_une_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_une_param_0];
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_une(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_une_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_une_param_1];
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp une <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ueq(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ueq_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ueq_param_0];
-; CHECK-NEXT: setp.equ.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.equ.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ueq(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ueq_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ueq_param_0];
+; CHECK-O0-NEXT: setp.equ.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.equ.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ueq(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ueq_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ueq_param_1];
+; CHECK-O3-NEXT: setp.equ.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.equ.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ueq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ugt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ugt_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ugt_param_0];
-; CHECK-NEXT: setp.gtu.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.gtu.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ugt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ugt_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ugt_param_0];
+; CHECK-O0-NEXT: setp.gtu.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.gtu.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ugt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ugt_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ugt_param_1];
+; CHECK-O3-NEXT: setp.gtu.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.gtu.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ugt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_uge(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uge_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uge_param_0];
-; CHECK-NEXT: setp.geu.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.geu.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_uge(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uge_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uge_param_0];
+; CHECK-O0-NEXT: setp.geu.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.geu.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_uge(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uge_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uge_param_1];
+; CHECK-O3-NEXT: setp.geu.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.geu.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp uge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ult(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ult_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ult_param_0];
-; CHECK-NEXT: setp.ltu.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.ltu.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ult(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ult_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ult_param_0];
+; CHECK-O0-NEXT: setp.ltu.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.ltu.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ult(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ult_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ult_param_1];
+; CHECK-O3-NEXT: setp.ltu.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.ltu.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ult <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ule(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ule_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ule_param_0];
-; CHECK-NEXT: setp.leu.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.leu.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ule(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ule_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ule_param_0];
+; CHECK-O0-NEXT: setp.leu.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.leu.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ule(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ule_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ule_param_1];
+; CHECK-O3-NEXT: setp.leu.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.leu.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ule <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_uno(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uno_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uno_param_0];
-; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.nan.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_uno(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uno_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uno_param_0];
+; CHECK-O0-NEXT: setp.nan.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.nan.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_uno(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_uno_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_uno_param_1];
+; CHECK-O3-NEXT: setp.nan.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.nan.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp uno <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_one(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_one_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_one_param_0];
-; CHECK-NEXT: setp.ne.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.ne.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_one(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_one_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_one_param_0];
+; CHECK-O0-NEXT: setp.ne.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.ne.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_one(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_one_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_one_param_1];
+; CHECK-O3-NEXT: setp.ne.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.ne.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp one <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_oeq(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oeq_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oeq_param_0];
-; CHECK-NEXT: setp.eq.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.eq.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_oeq(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oeq_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oeq_param_0];
+; CHECK-O0-NEXT: setp.eq.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.eq.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_oeq(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oeq_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oeq_param_1];
+; CHECK-O3-NEXT: setp.eq.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.eq.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp oeq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ogt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ogt_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ogt_param_0];
-; CHECK-NEXT: setp.gt.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.gt.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ogt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ogt_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ogt_param_0];
+; CHECK-O0-NEXT: setp.gt.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.gt.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ogt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ogt_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ogt_param_1];
+; CHECK-O3-NEXT: setp.gt.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.gt.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ogt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_oge(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oge_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oge_param_0];
-; CHECK-NEXT: setp.ge.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.ge.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_oge(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oge_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oge_param_0];
+; CHECK-O0-NEXT: setp.ge.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.ge.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_oge(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_oge_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_oge_param_1];
+; CHECK-O3-NEXT: setp.ge.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.ge.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp oge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_olt(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_olt_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_olt_param_0];
-; CHECK-NEXT: setp.lt.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.lt.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_olt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_olt_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_olt_param_0];
+; CHECK-O0-NEXT: setp.lt.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.lt.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_olt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_olt_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_olt_param_1];
+; CHECK-O3-NEXT: setp.lt.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.lt.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp olt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ole(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ole_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ole_param_0];
-; CHECK-NEXT: setp.le.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.le.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ole(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ole_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ole_param_0];
+; CHECK-O0-NEXT: setp.le.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.le.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ole(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ole_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ole_param_1];
+; CHECK-O3-NEXT: setp.le.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.le.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ole <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ord_param_1];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ord_param_0];
-; CHECK-NEXT: setp.num.f32 %p1, %f2, %f4;
-; CHECK-NEXT: setp.num.f32 %p2, %f1, %f3;
-; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fcmp_ord(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ord_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ord_param_0];
+; CHECK-O0-NEXT: setp.num.f32 %p1, %f2, %f4;
+; CHECK-O0-NEXT: setp.num.f32 %p2, %f1, %f3;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ord(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fcmp_ord_param_0];
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fcmp_ord_param_1];
+; CHECK-O3-NEXT: setp.num.f32 %p1, %f2, %f4;
+; CHECK-O3-NEXT: setp.num.f32 %p2, %f1, %f3;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ord <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i32_param_0];
-; CHECK-NEXT: cvt.rzi.s32.f32 %r1, %f2;
-; CHECK-NEXT: cvt.rzi.s32.f32 %r2, %f1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fptosi_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i32_param_0];
+; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptosi_i32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i32_param_0];
+; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i64_param_0];
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd1, %f2;
-; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %f1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fptosi_i64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i64_param_0];
+; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptosi_i64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptosi_i64_param_0];
+; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-O3-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptoui_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi32_param_0];
-; CHECK-NEXT: cvt.rzi.u32.f32 %r1, %f2;
-; CHECK-NEXT: cvt.rzi.u32.f32 %r2, %f1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fptoui_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi32_param_0];
+; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptoui_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi32_param_0];
+; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fptoui_2xi64(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi64_param_0];
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd1, %f2;
-; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %f1;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fptoui_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi64_param_0];
+; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptoui_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptoui_2xi64_param_0];
+; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-O3-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_uitofp_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
-; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_uitofp_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
-; CHECK-LABEL: test_uitofp_2xi64(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
-; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
-; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_uitofp_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-O0-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-O3-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_sitofp_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
-; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
-; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_sitofp_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-O0-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_sitofp_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
-; CHECK-LABEL: test_sitofp_2xi64(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
-; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
-; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_sitofp_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-O0-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_sitofp_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-O3-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = sitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
-; CHECK-LABEL: test_uitofp_2xi32_fadd(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
-; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<7>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-NEXT: cvt.rn.f32.u32 %f3, %r1;
-; CHECK-NEXT: mov.b32 %r3, %f3;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r3;
-; CHECK-NEXT: cvt.rn.f32.u32 %f4, %r2;
-; CHECK-NEXT: mov.b32 %r4, %f4;
-; CHECK-NEXT: cvt.u64.u32 %rd3, %r4;
-; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
-; CHECK-NEXT: or.b64 %rd5, %rd2, %rd4;
-; CHECK-NEXT: mov.b64 %rd6, {%f1, %f2};
-; CHECK-NEXT: add.rn.f32x2 %rd1, %rd6, %rd5;
-; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-NEXT: mov.b32 %f5, %r6;
-; CHECK-NEXT: mov.b32 %f6, %r5;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r3;
+; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r4;
+; CHECK-O0-NEXT: shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f3, %r1;
+; CHECK-O0-NEXT: mov.b32 %r5, %f3;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd6, %r5;
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f4, %r2;
+; CHECK-O0-NEXT: mov.b32 %r6, %f4;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r6;
+; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-O0-NEXT: mov.b32 %f5, %r8;
+; CHECK-O0-NEXT: mov.b32 %f6, %r7;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-O3-NEXT: mov.b32 %f5, %r4;
+; CHECK-O3-NEXT: mov.b32 %f6, %r3;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
ret <2 x float> %r
}
define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
-; CHECK-LABEL: test_fptrunc_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
-; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
-; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fptrunc_2xdouble(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .f64 %fd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-O0-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptrunc_2xdouble(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .f64 %fd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-O3-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = fptrunc <2 x double> %a to <2 x float>
ret <2 x float> %r
}
define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
-; CHECK-LABEL: test_fpext_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fpext_2xdouble_param_0];
-; CHECK-NEXT: cvt.f64.f32 %fd1, %f2;
-; CHECK-NEXT: cvt.f64.f32 %fd2, %f1;
-; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_fpext_2xdouble(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .f64 %fd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fpext_2xdouble_param_0];
+; CHECK-O0-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-O0-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fpext_2xdouble(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .f64 %fd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fpext_2xdouble_param_0];
+; CHECK-O3-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-O3-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-O3-NEXT: ret;
%r = fpext <2 x float> %a to <2 x double>
ret <2 x double> %r
}
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_2xi32_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f1;
-; CHECK-NEXT: mov.b64 %rd1, {%f1, %f2};
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f2;
+; CHECK-O0-NEXT: mov.b32 %r2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-O3-NEXT: mov.b32 %r1, %f2;
+; CHECK-O3-NEXT: mov.b32 %r2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: mov.b32 %f2, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-O0-NEXT: mov.b32 %f1, %r2;
+; CHECK-O0-NEXT: mov.b32 %f2, %r1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f1, %f2};
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
-; CHECK-LABEL: test_bitcast_double_to_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
-; CHECK-NEXT: mov.b64 %rd1, %fd1;
-; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
-; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
-; CHECK-NEXT: mov.b32 %f1, %r2;
-; CHECK-NEXT: mov.b32 %f2, %r1;
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-NEXT: .reg .f64 %fd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-O0-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-O0-NEXT: cvt.u32.u64 %r1, %rd1;
+; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
+; CHECK-O0-NEXT: mov.b32 %f1, %r2;
+; CHECK-O0-NEXT: mov.b32 %f2, %r1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f32 %f1, [test_bitcast_double_to_2xfloat_param_0+4];
+; CHECK-O3-NEXT: ld.param.f32 %f2, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT: ret;
%r = bitcast double %a to <2 x float>
ret <2 x float> %r
}
define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xfloat_to_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-NEXT: .reg .f64 %fd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_double_param_0];
-; CHECK-NEXT: mov.b32 %r1, %f1;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r1;
-; CHECK-NEXT: mov.b32 %r2, %f2;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r2;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: mov.b64 %fd1, %rd4;
-; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
-; CHECK-NEXT: ret;
+;
+; CHECK-O0-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-NEXT: .reg .f64 %fd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd1, %r1;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd3, %rd2, 32;
+; CHECK-O0-NEXT: or.b64 %rd4, %rd1, %rd3;
+; CHECK-O0-NEXT: mov.b64 %fd1, %rd4;
+; CHECK-O0-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f64 %fd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %fd1, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-O3-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x float> %a to double
ret double %r
}
>From 66c352214634289a0449aecf958c5f247ddfa6a9 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Feb 2025 17:44:29 -0800
Subject: [PATCH 17/22] remove unnecessary dag pattern for v2f32 build_vector
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ee6b3db4a195bd3..c0fbe164749acda 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3439,8 +3439,6 @@ def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
(V2I16toI32 $a, $b)>;
def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
(V2I16toI32 $a, $b)>;
-def : Pat<(v2f32 (build_vector f32:$a, f32:$b)),
- (V2F32toI64 $a, $b)>;
def : Pat<(i64 (build_pair (i32 (bitconvert f32:$a)),
(i32 (bitconvert f32:$b)))),
(V2F32toI64 $a, $b)>;
>From cc6715bd8264af7767258b69b4e0fa972c6fc175 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Feb 2025 17:59:47 -0800
Subject: [PATCH 18/22] break packed f32 into two f32 regs, not i32 regs
This enables better code simplification.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 13 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 12 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 540 +++++++-----------
5 files changed, 225 insertions(+), 345 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ea83ad449c10752..2cad645fd86e245 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -190,8 +190,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
SelectI128toV2I64(N);
return;
}
- if (N->getOperand(1).getValueType() == MVT::i64 && N->getNumValues() == 3) {
- SelectI64ToV2I32(N);
+ if (N->getOperand(1).getValueType() == MVT::i64 &&
+ N->getValueType(0) == MVT::f32) { // only {f32,f32} = mov i64; plain i64 copies fall through
+ SelectI64ToV2F32(N);
return;
}
break;
@@ -2769,13 +2770,15 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
ReplaceNode(N, Mov);
}
-void NVPTXDAGToDAGISel::SelectI64ToV2I32(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectI64ToV2F32(SDNode *N) {
SDValue Ch = N->getOperand(0);
SDValue Src = N->getOperand(1);
+ assert(N->getValueType(0) == MVT::f32 && N->getValueType(1) == MVT::f32 &&
+ "expected {f32,f32} = CopyFromReg i64");
SDLoc DL(N);
- SDNode *Mov = CurDAG->getMachineNode(NVPTX::I64toV2I32, DL,
- {MVT::i32, MVT::i32, Ch.getValueType()},
+ SDNode *Mov = CurDAG->getMachineNode(NVPTX::I64toV2F32, DL,
+ {MVT::f32, MVT::f32, Ch.getValueType()},
{Src, Ch});
ReplaceNode(N, Mov);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 62e81d250d2f734..703a80f74e90c71 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
- void SelectI64ToV2I32(SDNode *N);
+ void SelectI64ToV2F32(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d5c3a92a395941e..77864e52cedc111 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5638,20 +5638,22 @@ static void ReplaceF32x2Op(SDNode *N, SelectionDAG &DAG,
SDValue Chain = DAG.getEntryNode();
- // break i64 result into two i32 registers for later instructions that may
- // access element #0 or #1. otherwise, this code will be eliminated
+ // break packed result into two f32 registers for later instructions that may
+ // access element #0 or #1
SDValue NewValue = DAG.getNode(Opcode, DL, MVT::i64, NewOps);
MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
Register DestReg = RegInfo.createVirtualRegister(
DAG.getTargetLoweringInfo().getRegClassFor(MVT::i64));
SDValue RegCopy = DAG.getCopyToReg(Chain, DL, DestReg, NewValue);
SDValue Explode = DAG.getNode(ISD::CopyFromReg, DL,
- {MVT::i32, MVT::i32, Chain.getValueType()},
+ {MVT::f32, MVT::f32, Chain.getValueType()},
{RegCopy, DAG.getRegister(DestReg, MVT::i64)});
// cast i64 result of new op back to <2 x float>
Results.push_back(DAG.getBitcast(
- OldResultTy, DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
- {Explode.getValue(0), Explode.getValue(1)})));
+ OldResultTy,
+ DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+ {DAG.getBitcast(MVT::i32, Explode.getValue(0)),
+ DAG.getBitcast(MVT::i32, Explode.getValue(1))})));
}
void NVPTXTargetLowering::ReplaceNodeResults(
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c0fbe164749acda..b0eb9bbbb2456ac 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3387,6 +3387,9 @@ let hasSideEffects = false in {
def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ def I64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
(ins Int128Regs:$s),
"mov.b128 \t{{$d1, $d2}}, $s;", []>;
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 148c29517780597..fd330c18510a6ba 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -115,7 +115,7 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O0-LABEL: test_fadd(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -135,15 +135,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -153,10 +150,8 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
@@ -166,7 +161,7 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
;
; CHECK-O0-LABEL: test_fadd_imm_0(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -180,15 +175,12 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_0(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -197,10 +189,8 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
@@ -210,7 +200,7 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
;
; CHECK-O0-LABEL: test_fadd_imm_1(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -224,15 +214,12 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd6;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_1(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -241,10 +228,8 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
@@ -254,70 +239,61 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
;
; CHECK-O0-LABEL: test_fadd_v4(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<13>;
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
; CHECK-O0-NEXT: .reg .f32 %f<13>;
; CHECK-O0-NEXT: .reg .b64 %rd<19>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
; CHECK-O0-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
-; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: mov.b32 %r5, %f7;
; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r5;
-; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: mov.b32 %r6, %f8;
; CHECK-O0-NEXT: cvt.u64.u32 %rd12, %r6;
; CHECK-O0-NEXT: shl.b64 %rd13, %rd12, 32;
; CHECK-O0-NEXT: or.b64 %rd14, %rd11, %rd13;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd14;
-; CHECK-O0-NEXT: mov.b32 %r7, %f7;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd10, %rd14;
+; CHECK-O0-NEXT: mov.b32 %r7, %f5;
; CHECK-O0-NEXT: cvt.u64.u32 %rd15, %r7;
-; CHECK-O0-NEXT: mov.b32 %r8, %f8;
+; CHECK-O0-NEXT: mov.b32 %r8, %f6;
; CHECK-O0-NEXT: cvt.u64.u32 %rd16, %r8;
; CHECK-O0-NEXT: shl.b64 %rd17, %rd16, 32;
; CHECK-O0-NEXT: or.b64 %rd18, %rd15, %rd17;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd18;
-; CHECK-O0-NEXT: mov.b64 {%r9, %r10}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r11, %r12}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f9, %r12;
-; CHECK-O0-NEXT: mov.b32 %f10, %r11;
-; CHECK-O0-NEXT: mov.b32 %f11, %r10;
-; CHECK-O0-NEXT: mov.b32 %f12, %r9;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd6, %rd18;
+; CHECK-O0-NEXT: mov.b64 {%f9, %f10}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f11, %f12}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<13>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f5, %f6};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, {%f7, %f8};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f9, %r4;
-; CHECK-O3-NEXT: mov.b32 %f10, %r3;
-; CHECK-O3-NEXT: mov.b32 %f11, %r2;
-; CHECK-O3-NEXT: mov.b32 %f12, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f7, %f8};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f9, %f10}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f11, %f12}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -327,58 +303,49 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
;
; CHECK-O0-LABEL: test_fadd_imm_0_v4(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<13>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
-; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd11;
-; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd12;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r8;
-; CHECK-O0-NEXT: mov.b32 %f6, %r7;
-; CHECK-O0-NEXT: mov.b32 %f7, %r6;
-; CHECK-O0-NEXT: mov.b32 %f8, %r5;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: mov.b64 %rd11, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_0_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r4;
-; CHECK-O3-NEXT: mov.b32 %f6, %r3;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -388,58 +355,49 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
;
; CHECK-O0-LABEL: test_fadd_imm_1_v4(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<13>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
-; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd10, %rd11;
-; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd6, %rd12;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r8;
-; CHECK-O0-NEXT: mov.b32 %f6, %r7;
-; CHECK-O0-NEXT: mov.b32 %f7, %r6;
-; CHECK-O0-NEXT: mov.b32 %f8, %r5;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: mov.b64 %rd11, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd2, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_1_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r4;
-; CHECK-O3-NEXT: mov.b32 %f6, %r3;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -449,7 +407,7 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O0-LABEL: test_fsub(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -469,15 +427,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: sub.rn.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fsub(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -487,10 +442,8 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
@@ -500,7 +453,7 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
;
; CHECK-O0-LABEL: test_fneg(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -514,15 +467,12 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 0;
; CHECK-O0-NEXT: sub.rn.f32x2 %rd1, %rd6, %rd5;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fneg(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -531,10 +481,8 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 0;
; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
@@ -544,7 +492,7 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O0-LABEL: test_fmul(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -564,15 +512,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: mul.rn.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fmul(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -582,10 +527,8 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: mul.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
@@ -595,7 +538,7 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
;
; CHECK-O0-LABEL: test_fma(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<14>;
; CHECK-O0-EMPTY:
@@ -622,15 +565,12 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
; CHECK-O0-NEXT: shl.b64 %rd12, %rd11, 32;
; CHECK-O0-NEXT: or.b64 %rd13, %rd10, %rd12;
; CHECK-O0-NEXT: fma.rn.f32x2 %rd1, %rd5, %rd9, %rd13;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f7, %r8;
-; CHECK-O0-NEXT: mov.b32 %f8, %r7;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fma(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
@@ -642,10 +582,8 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
; CHECK-O3-NEXT: fma.rn.f32x2 %rd1, %rd2, %rd3, %rd4;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
@@ -735,7 +673,7 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O0-LABEL: test_fadd_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -755,15 +693,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -773,10 +708,8 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
@@ -786,7 +719,7 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
;
; CHECK-O0-LABEL: test_fadd_imm_0_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -800,15 +733,12 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_0_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -817,10 +747,8 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
@@ -830,7 +758,7 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
;
; CHECK-O0-LABEL: test_fadd_imm_1_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -844,15 +772,12 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 4611686019492741120;
; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd5, %rd6;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_1_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -861,10 +786,8 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
@@ -874,70 +797,61 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
;
; CHECK-O0-LABEL: test_fadd_v4_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<13>;
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
; CHECK-O0-NEXT: .reg .f32 %f<13>;
; CHECK-O0-NEXT: .reg .b64 %rd<19>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
; CHECK-O0-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
-; CHECK-O0-NEXT: mov.b32 %r5, %f5;
+; CHECK-O0-NEXT: mov.b32 %r5, %f7;
; CHECK-O0-NEXT: cvt.u64.u32 %rd11, %r5;
-; CHECK-O0-NEXT: mov.b32 %r6, %f6;
+; CHECK-O0-NEXT: mov.b32 %r6, %f8;
; CHECK-O0-NEXT: cvt.u64.u32 %rd12, %r6;
; CHECK-O0-NEXT: shl.b64 %rd13, %rd12, 32;
; CHECK-O0-NEXT: or.b64 %rd14, %rd11, %rd13;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd14;
-; CHECK-O0-NEXT: mov.b32 %r7, %f7;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd10, %rd14;
+; CHECK-O0-NEXT: mov.b32 %r7, %f5;
; CHECK-O0-NEXT: cvt.u64.u32 %rd15, %r7;
-; CHECK-O0-NEXT: mov.b32 %r8, %f8;
+; CHECK-O0-NEXT: mov.b32 %r8, %f6;
; CHECK-O0-NEXT: cvt.u64.u32 %rd16, %r8;
; CHECK-O0-NEXT: shl.b64 %rd17, %rd16, 32;
; CHECK-O0-NEXT: or.b64 %rd18, %rd15, %rd17;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd18;
-; CHECK-O0-NEXT: mov.b64 {%r9, %r10}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r11, %r12}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f9, %r12;
-; CHECK-O0-NEXT: mov.b32 %f10, %r11;
-; CHECK-O0-NEXT: mov.b32 %f11, %r10;
-; CHECK-O0-NEXT: mov.b32 %f12, %r9;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd6, %rd18;
+; CHECK-O0-NEXT: mov.b64 {%f9, %f10}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f11, %f12}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<13>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f5, %f6};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, {%f7, %f8};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f9, %r4;
-; CHECK-O3-NEXT: mov.b32 %f10, %r3;
-; CHECK-O3-NEXT: mov.b32 %f11, %r2;
-; CHECK-O3-NEXT: mov.b32 %f12, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f12, %f11, %f10, %f9};
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f7, %f8};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f9, %f10}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f11, %f12}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -947,58 +861,49 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
;
; CHECK-O0-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<13>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
-; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd11;
-; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd12;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r8;
-; CHECK-O0-NEXT: mov.b32 %f6, %r7;
-; CHECK-O0-NEXT: mov.b32 %f7, %r6;
-; CHECK-O0-NEXT: mov.b32 %f8, %r5;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: mov.b64 %rd11, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r4;
-; CHECK-O3-NEXT: mov.b32 %f6, %r3;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -1008,58 +913,49 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
;
; CHECK-O0-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<13>;
; CHECK-O0-EMPTY:
; CHECK-O0-NEXT: // %bb.0:
; CHECK-O0-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-O0-NEXT: mov.b32 %r1, %f3;
+; CHECK-O0-NEXT: mov.b32 %r1, %f1;
; CHECK-O0-NEXT: cvt.u64.u32 %rd3, %r1;
-; CHECK-O0-NEXT: mov.b32 %r2, %f4;
+; CHECK-O0-NEXT: mov.b32 %r2, %f2;
; CHECK-O0-NEXT: cvt.u64.u32 %rd4, %r2;
; CHECK-O0-NEXT: shl.b64 %rd5, %rd4, 32;
; CHECK-O0-NEXT: or.b64 %rd6, %rd3, %rd5;
-; CHECK-O0-NEXT: mov.b32 %r3, %f1;
+; CHECK-O0-NEXT: mov.b32 %r3, %f3;
; CHECK-O0-NEXT: cvt.u64.u32 %rd7, %r3;
-; CHECK-O0-NEXT: mov.b32 %r4, %f2;
+; CHECK-O0-NEXT: mov.b32 %r4, %f4;
; CHECK-O0-NEXT: cvt.u64.u32 %rd8, %r4;
; CHECK-O0-NEXT: shl.b64 %rd9, %rd8, 32;
; CHECK-O0-NEXT: or.b64 %rd10, %rd7, %rd9;
-; CHECK-O0-NEXT: mov.b64 %rd11, 4611686019492741120;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd10, %rd11;
-; CHECK-O0-NEXT: mov.b64 %rd12, 4647714816524288000;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd6, %rd12;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r8;
-; CHECK-O0-NEXT: mov.b32 %f6, %r7;
-; CHECK-O0-NEXT: mov.b32 %f7, %r6;
-; CHECK-O0-NEXT: mov.b32 %f8, %r5;
-; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O0-NEXT: mov.b64 %rd11, 4647714816524288000;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd1, %rd10, %rd11;
+; CHECK-O0-NEXT: mov.b64 %rd12, 4611686019492741120;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd2, %rd6, %rd12;
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O0-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<7>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mov.b64 %rd4, {%f1, %f2};
-; CHECK-O3-NEXT: mov.b64 %rd5, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r4;
-; CHECK-O3-NEXT: mov.b32 %f6, %r3;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
+; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
+; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
+; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -1069,7 +965,7 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O0-LABEL: test_fsub_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -1089,15 +985,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fsub_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -1107,10 +1000,8 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
@@ -1120,7 +1011,7 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
;
; CHECK-O0-LABEL: test_fneg_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<5>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
; CHECK-O0-NEXT: .reg .f32 %f<5>;
; CHECK-O0-NEXT: .reg .b64 %rd<7>;
; CHECK-O0-EMPTY:
@@ -1134,15 +1025,12 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
; CHECK-O0-NEXT: or.b64 %rd5, %rd2, %rd4;
; CHECK-O0-NEXT: mov.b64 %rd6, 0;
; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd1, %rd6, %rd5;
-; CHECK-O0-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f3, %r4;
-; CHECK-O0-NEXT: mov.b32 %f4, %r3;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fneg_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<5>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -1151,10 +1039,8 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 0;
; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f3, %r2;
-; CHECK-O3-NEXT: mov.b32 %f4, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
@@ -1164,7 +1050,7 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O0-LABEL: test_fmul_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<7>;
+; CHECK-O0-NEXT: .reg .b32 %r<5>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -1184,15 +1070,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: mul.rn.ftz.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r5, %r6}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r6;
-; CHECK-O0-NEXT: mov.b32 %f6, %r5;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fmul_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -1202,10 +1085,8 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r2;
-; CHECK-O3-NEXT: mov.b32 %f6, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
@@ -1215,7 +1096,7 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
;
; CHECK-O0-LABEL: test_fma_ftz(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
; CHECK-O0-NEXT: .reg .f32 %f<9>;
; CHECK-O0-NEXT: .reg .b64 %rd<14>;
; CHECK-O0-EMPTY:
@@ -1242,15 +1123,12 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-O0-NEXT: shl.b64 %rd12, %rd11, 32;
; CHECK-O0-NEXT: or.b64 %rd13, %rd10, %rd12;
; CHECK-O0-NEXT: fma.rn.ftz.f32x2 %rd1, %rd5, %rd9, %rd13;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f7, %r8;
-; CHECK-O0-NEXT: mov.b32 %f8, %r7;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_fma_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<9>;
; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
@@ -1262,10 +1140,8 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd1, %rd2, %rd3, %rd4;
-; CHECK-O3-NEXT: mov.b64 {%r1, %r2}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f7, %r2;
-; CHECK-O3-NEXT: mov.b32 %f8, %r1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f8, %f7};
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
@@ -2585,7 +2461,7 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
;
; CHECK-O0-LABEL: test_uitofp_2xi32_fadd(
; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-NEXT: .reg .b32 %r<7>;
; CHECK-O0-NEXT: .reg .f32 %f<7>;
; CHECK-O0-NEXT: .reg .b64 %rd<10>;
; CHECK-O0-EMPTY:
@@ -2607,15 +2483,13 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-O0-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-O0-NEXT: or.b64 %rd9, %rd6, %rd8;
; CHECK-O0-NEXT: add.rn.f32x2 %rd1, %rd5, %rd9;
-; CHECK-O0-NEXT: mov.b64 {%r7, %r8}, %rd1;
-; CHECK-O0-NEXT: mov.b32 %f5, %r8;
-; CHECK-O0-NEXT: mov.b32 %f6, %r7;
-; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O0-NEXT: ret;
;
; CHECK-O3-LABEL: test_uitofp_2xi32_fadd(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<5>;
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
; CHECK-O3-NEXT: .reg .f32 %f<7>;
; CHECK-O3-NEXT: .reg .b64 %rd<4>;
; CHECK-O3-EMPTY:
@@ -2627,10 +2501,8 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_uitofp_2xi32_fadd_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%r3, %r4}, %rd1;
-; CHECK-O3-NEXT: mov.b32 %f5, %r4;
-; CHECK-O3-NEXT: mov.b32 %f6, %r3;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
; CHECK-O3-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
>From 8de5c5d4bc6bb63cd4a11e41857ae43a299f2c9c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 7 Feb 2025 12:52:02 -0800
Subject: [PATCH 19/22] simplify tablegen defs for packed f32 insns
---
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 55 +++++++++---------------
1 file changed, 20 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0cf2b024fbba33d..47e5b667b892132 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1582,49 +1582,34 @@ def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
// packed f32 ops (sm_100+)
-class F32x2Op2<string OpcStr, Predicate Pred>
+
+def fadd32x2_nvptx : SDNode<"NVPTXISD::FADD_F32X2", SDTIntBinOp>;
+def fsub32x2_nvptx : SDNode<"NVPTXISD::FSUB_F32X2", SDTIntBinOp>;
+def fmul32x2_nvptx : SDNode<"NVPTXISD::FMUL_F32X2", SDTIntBinOp>;
+def fma32x2_nvptx : SDNode<"NVPTXISD::FMA_F32X2", SDTIntTernaryOp>;
+
+class F32x2Op2<string OpcStr, SDNode Op, Predicate Pred>
: NVPTXInst<(outs Int64Regs:$res),
(ins Int64Regs:$a, Int64Regs:$b),
- OpcStr # ".f32x2 \t$res, $a, $b;", []>,
+ OpcStr # ".f32x2 \t$res, $a, $b;",
+ [(set i64:$res, (Op i64:$a, i64:$b))]>,
Requires<[hasF32x2Instructions, Pred]>;
-class F32x2Op3<string OpcStr, Predicate Pred>
+class F32x2Op3<string OpcStr, SDNode Op, Predicate Pred>
: NVPTXInst<(outs Int64Regs:$res),
(ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- OpcStr # ".f32x2 \t$res, $a, $b, $c;", []>,
+ OpcStr # ".f32x2 \t$res, $a, $b, $c;",
+ [(set i64:$res, (Op i64:$a, i64:$b, i64:$c))]>,
Requires<[hasF32x2Instructions, Pred]>;
-def fadd32x2_nvptx : SDNode<"NVPTXISD::FADD_F32X2", SDTIntBinOp>;
-def fsub32x2_nvptx : SDNode<"NVPTXISD::FSUB_F32X2", SDTIntBinOp>;
-def fmul32x2_nvptx : SDNode<"NVPTXISD::FMUL_F32X2", SDTIntBinOp>;
-def fma32x2_nvptx : SDNode<"NVPTXISD::FMA_F32X2", SDTIntTernaryOp>;
+def FADD32x2 : F32x2Op2<"add.rn", fadd32x2_nvptx, doNoF32FTZ>;
+def FSUB32x2 : F32x2Op2<"sub.rn", fsub32x2_nvptx, doNoF32FTZ>;
+def FMUL32x2 : F32x2Op2<"mul.rn", fmul32x2_nvptx, doNoF32FTZ>;
+def FMA32x2 : F32x2Op3<"fma.rn", fma32x2_nvptx, doNoF32FTZ>;
-def FADD32x2 : F32x2Op2<"add.rn", doNoF32FTZ>;
-def FSUB32x2 : F32x2Op2<"sub.rn", doNoF32FTZ>;
-def FMUL32x2 : F32x2Op2<"mul.rn", doNoF32FTZ>;
-def FMA32x2 : F32x2Op3<"fma.rn", doNoF32FTZ>;
-
-def : Pat<(fadd32x2_nvptx i64:$a, i64:$b),
- (FADD32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fsub32x2_nvptx i64:$a, i64:$b),
- (FSUB32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fmul32x2_nvptx i64:$a, i64:$b),
- (FMUL32x2 $a, $b)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fma32x2_nvptx i64:$a, i64:$b, i64:$c),
- (FMA32x2 $a, $b, $c)>, Requires<[doNoF32FTZ]>;
-
-def FADD32x2_ftz : F32x2Op2<"add.rn.ftz", doF32FTZ>;
-def FSUB32x2_ftz : F32x2Op2<"sub.rn.ftz", doF32FTZ>;
-def FMUL32x2_ftz : F32x2Op2<"mul.rn.ftz", doF32FTZ>;
-def FMA32x2_ftz : F32x2Op3<"fma.rn.ftz", doF32FTZ>;
-
-def : Pat<(fadd32x2_nvptx i64:$a, i64:$b),
- (FADD32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
-def : Pat<(fsub32x2_nvptx i64:$a, i64:$b),
- (FSUB32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
-def : Pat<(fmul32x2_nvptx i64:$a, i64:$b),
- (FMUL32x2_ftz $a, $b)>, Requires<[doF32FTZ]>;
-def : Pat<(fma32x2_nvptx i64:$a, i64:$b, i64:$c),
- (FMA32x2_ftz $a, $b, $c)>, Requires<[doF32FTZ]>;
+def FADD32x2_ftz : F32x2Op2<"add.rn.ftz", fadd32x2_nvptx, doF32FTZ>;
+def FSUB32x2_ftz : F32x2Op2<"sub.rn.ftz", fsub32x2_nvptx, doF32FTZ>;
+def FMUL32x2_ftz : F32x2Op2<"mul.rn.ftz", fmul32x2_nvptx, doF32FTZ>;
+def FMA32x2_ftz : F32x2Op3<"fma.rn.ftz", fma32x2_nvptx, doF32FTZ>;
//
// BFIND
>From d85f96fcaf2d59069782a2a06127392a54400283 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 7 Feb 2025 19:10:47 -0800
Subject: [PATCH 20/22] add combine rule to simplify vector stores
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 97 +++++++-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 219 ++++++++----------
2 files changed, 186 insertions(+), 130 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 77864e52cedc111..92f0bffcecbfecf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4619,26 +4619,109 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+// If Lo and Hi are two different results of one CopyFromReg whose operand 0 is a CopyToReg of an i64 into the same register, returns that i64; otherwise returns an empty SDValue.
+static SDValue peekThroughF32x2Copy(const SDValue &Lo, const SDValue &Hi) {
+ if (Lo.getValueType() != MVT::f32 || Lo.getOpcode() != ISD::CopyFromReg ||
+ Lo.getNode() != Hi.getNode() || Lo == Hi) // NOTE(review): Hi's type and the Lo/Hi result order are not checked here - confirm
+ return SDValue();
+
+ SDNode *CopyF = Lo.getNode();
+ SDNode *CopyT = CopyF->getOperand(0).getNode();
+ if (CopyT->getOpcode() != ISD::CopyToReg)
+ return SDValue();
+
+ // The CopyFromReg must read the same register that the CopyToReg writes.
+ if (cast<RegisterSDNode>(CopyF->getOperand(1))->getReg() !=
+ cast<RegisterSDNode>(CopyT->getOperand(1))->getReg())
+ return SDValue();
+
+ SDValue OrigV = CopyT->getOperand(2);
+ if (OrigV.getValueType() != MVT::i64)
+ return SDValue();
+ return OrigV;
+}
+
+static SDValue
+PerformPackedF32StoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ // Rewrite a V2/V4 store of f32 values as the next-narrower store (V2 -> scalar, V4 -> V2) of i64 values when every Lo/Hi operand pair peeks through to a packed i64; otherwise return an empty SDValue.
+ auto *MemN = cast<MemSDNode>(N);
+ if (MemN->getMemoryVT() == MVT::f32) {
+ std::optional<NVPTXISD::NodeType> NewOpcode;
+ switch (MemN->getOpcode()) {
+ case NVPTXISD::StoreRetvalV2:
+ NewOpcode = NVPTXISD::StoreRetval;
+ break;
+ case NVPTXISD::StoreRetvalV4:
+ NewOpcode = NVPTXISD::StoreRetvalV2;
+ break;
+ case NVPTXISD::StoreParamV2:
+ NewOpcode = NVPTXISD::StoreParam;
+ break;
+ case NVPTXISD::StoreParamV4:
+ NewOpcode = NVPTXISD::StoreParamV2;
+ break;
+ }
+
+ if (NewOpcode) {
+ SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
+ unsigned NumPacked = 0;
+
+ // Walk operands pairwise from index 2; one non-packed pair cancels the rewrite. NOTE(review): only operands 0-1 are kept, but PerformStoreParamCombine documents {Chain, ArgID, Offset, Val, Glue} - confirm the StoreParam cases line up.
+ for (unsigned I = 2, E = MemN->getNumOperands(); I < E; I += 2) {
+ if (SDValue Packed = peekThroughF32x2Copy(MemN->getOperand(I),
+ MemN->getOperand(I + 1))) {
+ NewOps.push_back(Packed);
+ ++NumPacked;
+ } else {
+ NumPacked = 0;
+ break;
+ }
+ }
+
+ if (NumPacked) {
+ return DCI.DAG.getMemIntrinsicNode(
+ *NewOpcode, SDLoc(N), N->getVTList(), NewOps, MVT::i64,
+ MemN->getPointerInfo(), MemN->getAlign(),
+ MachineMemOperand::MOStore);
+ }
+ }
+ }
+ return SDValue();
+}
+
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
- std::size_t Back) {
+ std::size_t Back,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
if (all_of(N->ops().drop_front(Front).drop_back(Back),
[](const SDUse &U) { return U.get()->isUndef(); }))
// Operand 0 is the previous value in the chain. Cannot return EntryToken
// as the previous value will become unused and eliminated later.
return N->getOperand(0);
+ if (SDValue V = PerformPackedF32StoreCombine(N, DCI, OptLevel))
+ return V;
+
return SDValue();
}
-static SDValue PerformStoreParamCombine(SDNode *N) {
+static SDValue PerformStoreParamCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
// Operands from the 3rd to the 2nd last one are the values to be stored.
// {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, 3, 1);
+ return PerformStoreCombineHelper(N, 3, 1, DCI, OptLevel);
}
-static SDValue PerformStoreRetvalCombine(SDNode *N) {
+static SDValue PerformStoreRetvalCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
// Operands from the 2nd to the last one are the values to be stored
- return PerformStoreCombineHelper(N, 2, 0);
+ return PerformStoreCombineHelper(N, 2, 0, DCI, OptLevel);
}
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
@@ -5329,11 +5412,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
- return PerformStoreRetvalCombine(N);
+ return PerformStoreRetvalCombine(N, DCI, OptLevel);
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N);
+ return PerformStoreParamCombine(N, DCI, OptLevel);
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACTCombine(N, DCI);
case ISD::VSELECT:
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index fd330c18510a6ba..7160d3483e172d4 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -141,17 +141,16 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O3-LABEL: test_fadd(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
@@ -181,16 +180,15 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
;
; CHECK-O3-LABEL: test_fadd_imm_0(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
@@ -220,16 +218,15 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
;
; CHECK-O3-LABEL: test_fadd_imm_1(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
@@ -279,8 +276,8 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
;
; CHECK-O3-LABEL: test_fadd_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<13>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_param_0];
@@ -288,12 +285,10 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_param_1];
; CHECK-O3-NEXT: mov.b64 %rd5, {%f7, %f8};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, {%f5, %f6};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f9, %f10}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f11, %f12}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -332,20 +327,18 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
;
; CHECK-O3-LABEL: test_fadd_imm_0_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_param_0];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -384,20 +377,18 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
;
; CHECK-O3-LABEL: test_fadd_imm_1_v4(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_param_0];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -433,17 +424,16 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O3-LABEL: test_fsub(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
@@ -473,16 +463,15 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
;
; CHECK-O3-LABEL: test_fneg(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 0;
-; CHECK-O3-NEXT: sub.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd4, %rd3, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
@@ -518,17 +507,16 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
;
; CHECK-O3-LABEL: test_fmul(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mul.rn.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
@@ -571,8 +559,8 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
;
; CHECK-O3-LABEL: test_fma(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<6>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_param_0];
@@ -581,9 +569,8 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_param_2];
; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
-; CHECK-O3-NEXT: fma.rn.f32x2 %rd1, %rd2, %rd3, %rd4;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
+; CHECK-O3-NEXT: fma.rn.f32x2 %rd5, %rd2, %rd3, %rd4;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd5;
; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
@@ -699,17 +686,16 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O3-LABEL: test_fadd_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fadd_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
@@ -739,16 +725,15 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
;
; CHECK-O3-LABEL: test_fadd_imm_0_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_0_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
@@ -778,16 +763,15 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
;
; CHECK-O3-LABEL: test_fadd_imm_1_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fadd_imm_1_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
@@ -837,8 +821,8 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
;
; CHECK-O3-LABEL: test_fadd_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<13>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_v4_ftz_param_0];
@@ -846,12 +830,10 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [test_fadd_v4_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd5, {%f7, %f8};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, {%f5, %f6};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f9, %f10}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f11, %f12}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f11, %f12, %f9, %f10};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -890,20 +872,18 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
;
; CHECK-O3-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_0_v4_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -942,20 +922,18 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
;
; CHECK-O3-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<7>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [test_fadd_imm_1_v4_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd4, {%f3, %f4};
; CHECK-O3-NEXT: mov.b64 %rd5, 4647714816524288000;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd1, %rd4, %rd5;
-; CHECK-O3-NEXT: mov.b64 %rd6, 4611686019492741120;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd2, %rd3, %rd6;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd2;
-; CHECK-O3-NEXT: st.param.v4.f32 [func_retval0], {%f7, %f8, %f5, %f6};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O3-NEXT: mov.b64 %rd7, 4611686019492741120;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -991,17 +969,16 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O3-LABEL: test_fsub_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fsub_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fsub_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
@@ -1031,16 +1008,15 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
;
; CHECK-O3-LABEL: test_fneg_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fneg_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: mov.b64 %rd3, 0;
-; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f3, %f4};
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd4, %rd3, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
@@ -1076,17 +1052,16 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
;
; CHECK-O3-LABEL: test_fmul_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fmul_ftz_param_0];
; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f2};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_fmul_ftz_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd1, %rd2, %rd3;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
@@ -1129,8 +1104,8 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
;
; CHECK-O3-LABEL: test_fma_ftz(
; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<6>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fma_ftz_param_0];
@@ -1139,9 +1114,8 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f5, %f6}, [test_fma_ftz_param_2];
; CHECK-O3-NEXT: mov.b64 %rd4, {%f5, %f6};
-; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd1, %rd2, %rd3, %rd4;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f7, %f8};
+; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd5, %rd2, %rd3, %rd4;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd5;
; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
@@ -2490,8 +2464,8 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-O3-LABEL: test_uitofp_2xi32_fadd(
; CHECK-O3: {
; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
; CHECK-O3-EMPTY:
; CHECK-O3-NEXT: // %bb.0:
; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
@@ -2500,9 +2474,8 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
; CHECK-O3-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_uitofp_2xi32_fadd_param_1];
; CHECK-O3-NEXT: mov.b64 %rd3, {%f3, %f4};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd1, %rd3, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd1;
-; CHECK-O3-NEXT: st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd4, %rd3, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-O3-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
>From 8b3c62d9f9b66fb13c53c3cd7543f864335c77cc Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 7 Feb 2025 19:51:13 -0800
Subject: [PATCH 21/22] [NVPTX] remove unused param in ReplaceF32x2Op()
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 92f0bffcecbfecf..1e417f23fdb099e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5686,8 +5686,7 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
}
static void ReplaceF32x2Op(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- bool UseFTZ) {
+ SmallVectorImpl<SDValue> &Results) {
SDLoc DL(N);
EVT OldResultTy = N->getValueType(0); // <2 x float>
assert(OldResultTy == MVT::v2f32 && "Unexpected result type for F32x2 op!");
@@ -5760,7 +5759,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::FSUB:
case ISD::FMUL:
case ISD::FMA:
- ReplaceF32x2Op(N, DAG, Results, useF32FTZ(DAG.getMachineFunction()));
+ ReplaceF32x2Op(N, DAG, Results);
return;
}
}
>From 1e97b9a94b4a49a5dbcc1f4fed30f7265c036407 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 7 Feb 2025 21:03:27 -0800
Subject: [PATCH 22/22] [NVPTX] strengthen check for CopyFromReg in ISelDAGToDAG
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2cad645fd86e245..3a39f6dab0c85fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -190,7 +190,8 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
SelectI128toV2I64(N);
return;
}
- if (N->getOperand(1).getValueType() == MVT::i64) {
+ if (N->getOperand(1).getValueType() == MVT::i64 &&
+ N->getValueType(0) == MVT::f32 && N->getValueType(1) == MVT::f32) {
// {f32,f32} = mov i64
SelectI64ToV2F32(N);
return;
More information about the llvm-commits mailing list