[llvm] e05bad4 - [NVPTX] legalize v2i32 to improve compatibility with v2f32 (#153478)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 26 13:52:15 PDT 2025
Author: Princeton Ferro
Date: 2025-09-26T13:52:11-07:00
New Revision: e05bad472d5fc572183dd6498a9b8a1b568ce9be
URL: https://github.com/llvm/llvm-project/commit/e05bad472d5fc572183dd6498a9b8a1b568ce9be
DIFF: https://github.com/llvm/llvm-project/commit/e05bad472d5fc572183dd6498a9b8a1b568ce9be.diff
LOG: [NVPTX] legalize v2i32 to improve compatibility with v2f32 (#153478)
Because v2f32 is legal but v2i32 is not, some sequences of operations,
such as bitcast (build_vector), are lowered inefficiently. Legalizing
v2i32 (placing it in the same 64-bit B64 register class as v2f32) lets
such pairs stay packed instead of being split across scalar registers.
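As a minimal sketch of the affected pattern (not taken from the commit's
tests; the function and value names are hypothetical), the IR below builds
a <2 x i32> and bitcasts it to <2 x float>. SelectionDAG turns the
insertelement chain into a build_vector, and before this change the
bitcast could be lowered through explicit i64 bitwise packing, as the new
f32x2-convert-i32x2.ll test notes.

; Hypothetical illustration: with v2i32 legal, %v can stay in a single
; 64-bit register across the bitcast and the packed fadd.
define <2 x float> @pack_and_add(i32 %a, i32 %b) {
  %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
  %v = insertelement <2 x i32> %v0, i32 %b, i32 1
  %f = bitcast <2 x i32> %v to <2 x float>
  %r = fadd <2 x float> %f, %f
  ret <2 x float> %r
}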
Added:
llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
llvm/lib/Target/NVPTX/NVPTXUtilities.h
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e86abb7203f2b..bef4868492d4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1027,6 +1027,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
case MVT::f32:
return Opcode_i32;
case MVT::v2f32:
+ case MVT::v2i32:
case MVT::i64:
case MVT::f64:
return Opcode_i64;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 68935784128c0..3ac7c2874408b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -226,21 +226,20 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
switch (VectorVT.SimpleTy) {
default:
return std::nullopt;
+
case MVT::v4i64:
case MVT::v4f64:
- case MVT::v8i32:
- // This is a "native" vector type iff the address space is global
- // and the target supports 256-bit loads/stores
+ // This is a "native" vector type iff the address space is global and the
+ // target supports 256-bit loads/stores
if (!CanLowerTo256Bit)
return std::nullopt;
LLVM_FALLTHROUGH;
case MVT::v2i8:
- case MVT::v2i32:
case MVT::v2i64:
case MVT::v2f64:
- case MVT::v4i32:
// This is a "native" vector type
return std::pair(NumElts, EltVT);
+
case MVT::v16f16: // <8 x f16x2>
case MVT::v16bf16: // <8 x bf16x2>
case MVT::v16i16: // <8 x i16x2>
@@ -264,12 +263,18 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
case MVT::v16i8: // <4 x i8x4>
PackRegSize = 32;
break;
+
case MVT::v8f32: // <4 x f32x2>
+ case MVT::v8i32: // <4 x i32x2>
+ // This is a "native" vector type iff the address space is global and the
+ // target supports 256-bit loads/stores
if (!CanLowerTo256Bit)
return std::nullopt;
LLVM_FALLTHROUGH;
case MVT::v2f32: // <1 x f32x2>
case MVT::v4f32: // <2 x f32x2>
+ case MVT::v2i32: // <1 x i32x2>
+ case MVT::v4i32: // <2 x i32x2>
if (!STI.hasF32x2Instructions())
return std::pair(NumElts, EltVT);
PackRegSize = 64;
@@ -590,8 +595,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
- if (STI.hasF32x2Instructions())
+ if (STI.hasF32x2Instructions()) {
addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
+ addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
+ }
// Conversion to/from FP16/FP16x2 is always legal.
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -628,12 +635,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
- // No support for these operations with v2f32.
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand);
+ // No support for these operations with v2f32/v2i32
+ setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
// Need custom lowering in case the index is dynamic.
if (STI.hasF32x2Instructions())
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
+ Custom);
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
@@ -661,14 +669,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Operations not directly supported by NVPTX.
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
- MVT::v4i8, MVT::i32, MVT::i64}) {
+ MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::BR_CC, VT, Expand);
}
- // Not directly supported. TLI would attempt to expand operations like
- // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes.
- setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
+ // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
+ setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
@@ -815,7 +822,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC},
- MVT::v2i16, Expand);
+ {MVT::v2i16, MVT::v2i32}, Expand);
+
+ // v2i32 is not supported for any arithmetic operations
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
+ ISD::CTPOP, ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL,
+ ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::XOR,
+ ISD::SREM, ISD::UREM},
+ MVT::v2i32, Expand);
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
@@ -829,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
}
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
+ setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
@@ -1071,7 +1085,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom lowering for tcgen05.st vector operands
setOperationAction(ISD::INTRINSIC_VOID,
{MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
- MVT::v32i32, MVT::v64i32, MVT::v128i32},
+ MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
Custom);
// Enable custom lowering for the following:
@@ -2604,7 +2618,7 @@ static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
return V;
}
-static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
SDLoc DL(N);
SmallVector<SDValue, 32> Ops;
@@ -2719,7 +2733,52 @@ static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) {
return Tcgen05MMANode;
}
-static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
+// Lower vector return type of tcgen05.ld intrinsics
+static std::optional<std::pair<SDValue, SDValue>>
+lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ if (!ResVT.isVector())
+ return {}; // already legalized.
+
+ const unsigned NumElts = ResVT.getVectorNumElements();
+
+ // Create the return type of the instructions
+ SmallVector<EVT, 5> ListVTs;
+ for (unsigned i = 0; i < NumElts; ++i)
+ ListVTs.push_back(MVT::i32);
+
+ ListVTs.push_back(N->getValueType(1)); // Chain
+
+ SDVTList ResVTs = DAG.getVTList(ListVTs);
+
+ SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
+ N->getOperand(2)};
+
+ if (HasOffset) {
+ Ops.push_back(N->getOperand(3)); // offset
+ Ops.push_back(N->getOperand(4)); // Pack flag
+ } else
+ Ops.push_back(N->getOperand(3)); // Pack flag
+
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+ SDValue NewNode =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+ // split the vector result
+ SmallVector<SDValue, 4> ScalarRes;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewNode.getValue(i);
+ ScalarRes.push_back(Res);
+ }
+
+ SDValue Chain = NewNode.getValue(NumElts);
+ SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ return {{BuildVector, Chain}};
+}
+
+static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
SDValue Intrin = N->getOperand(1);
@@ -2765,7 +2824,7 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
- return LowerTcgen05St(Op, DAG);
+ return lowerTcgen05St(Op, DAG);
case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
@@ -2867,6 +2926,28 @@ static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
SDValue Selector = (Op->op_end() - 1)->get();
return getPRMT(A, B, Selector, DL, DAG, Mode);
}
+
+static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
+ switch (Op->getConstantOperandVal(1)) {
+ default:
+ return Op;
+
+ // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
+ // lower them through LowerOperation() instead of ReplaceNodeResults().
+ case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
+ case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
+ case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
+ if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
+ return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
+ return SDValue();
+
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
+ if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
+ return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
+ return SDValue();
+ }
+}
+
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
switch (Op->getConstantOperandVal(0)) {
default:
@@ -3029,11 +3110,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST:
return LowerADDRSPACECAST(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
- return Op;
+ return lowerIntrinsicWChain(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return lowerIntrinsicWOChain(Op, DAG);
case ISD::INTRINSIC_VOID:
- return LowerIntrinsicVoid(Op, DAG);
+ return lowerIntrinsicVoid(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::BITCAST:
@@ -5920,7 +6001,7 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
// extract_vector_elt.
- // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8),
+ // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
// we already handle them OK.
if (VectorVT.getVectorNumElements() == 1 ||
NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
@@ -6300,53 +6381,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
}
-// Lower vector return type of tcgen05.ld intrinsics
-static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- bool hasOffset = false) {
- SDLoc DL(N);
- EVT ResVT = N->getValueType(0);
- if (!ResVT.isVector())
- return; // already legalized.
-
- const unsigned NumElts = ResVT.getVectorNumElements();
-
- // Create the return type of the instructions
- SmallVector<EVT, 5> ListVTs;
- for (unsigned i = 0; i < NumElts; ++i)
- ListVTs.push_back(MVT::i32);
-
- ListVTs.push_back(N->getValueType(1)); // Chain
-
- SDVTList ResVTs = DAG.getVTList(ListVTs);
-
- SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
- N->getOperand(2)};
-
- if (hasOffset) {
- Ops.push_back(N->getOperand(3)); // offset
- Ops.push_back(N->getOperand(4)); // Pack flag
- } else
- Ops.push_back(N->getOperand(3)); // Pack flag
-
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
- SDValue NewNode =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
-
- // split the vector result
- SmallVector<SDValue, 4> ScalarRes;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewNode.getValue(i);
- ScalarRes.push_back(Res);
- }
-
- SDValue Chain = NewNode.getValue(NumElts);
- SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
- Results.push_back(BuildVector); // Build Vector
- Results.push_back(Chain); // Chain
-}
-
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
@@ -6455,21 +6489,18 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
return;
}
- case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
- case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
- case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
@@ -6482,16 +6513,23 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
- return ReplaceTcgen05Ld(N, DAG, Results);
+ if (auto Res = lowerTcgen05Ld(N, DAG)) {
+ Results.push_back(Res->first);
+ Results.push_back(Res->second);
+ }
+ return;
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
- return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true);
+ if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
+ Results.push_back(Res->first);
+ Results.push_back(Res->second);
+ }
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 0b2bb694f1fd5..4cacee2290763 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -756,8 +756,10 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
(SELP_b32rr $a, $b, $p)>;
}
-def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)),
+foreach vt = [v2f32, v2i32] in {
+def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
(SELP_b64rr $a, $b, $p)>;
+}
//-----------------------------------
// Test Instructions
@@ -2101,8 +2103,8 @@ foreach vt = [v2f16, v2bf16, v2i16] in {
(V2I16toI32 $a, $b)>;
}
-// Same thing for the 64-bit type v2f32.
-foreach vt = [v2f32] in {
+// Handle extracting one element from the pair (64-bit types)
+foreach vt = [v2f32, v2i32] in {
def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>;
def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 2e81ab122d1df..913487b64617a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -54,7 +54,8 @@ def B16 : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>;
def B32 : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
-def B64 : NVPTXRegClass<[i64, v2f32, f64], 64, (add (sequence "RL%u", 0, 4),
+def B64 : NVPTXRegClass<[i64, v2i32, v2f32, f64], 64,
+ (add (sequence "RL%u", 0, 4),
VRFrame64, VRFrameLocal64)>;
// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
def B128 : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index a070789f85e0b..4b5cb30fd3036 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -99,8 +99,8 @@ namespace NVPTX {
// register. NOTE: This must be kept in sync with the register classes
// defined in NVPTXRegisterInfo.td.
inline auto packed_types() {
- static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16,
- MVT::v2i16, MVT::v2f32};
+ static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16,
+ MVT::v2i16, MVT::v2f32, MVT::v2i32};
return PackedTypes;
}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
new file mode 100644
index 0000000000000..18fb87935d17d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \
+; RUN: %}
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64.
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare <2 x i32> @return_i32x2(i32 %0)
+
+; Test with v2i32.
+define ptx_kernel void @store_i32x2(i32 %0, ptr %p) {
+; CHECK-SM90A-LABEL: store_i32x2(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
+; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
+; CHECK-SM90A-NEXT: { // callseq 0, 0
+; CHECK-SM90A-NEXT: .param .b32 param0;
+; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1;
+; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0);
+; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0];
+; CHECK-SM90A-NEXT: } // callseq 0
+; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3;
+; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2;
+; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4};
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: store_i32x2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<2>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<4>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
+; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
+; CHECK-SM100-NEXT: { // callseq 0, 0
+; CHECK-SM100-NEXT: .param .b32 param0;
+; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-SM100-NEXT: st.param.b32 [param0], %r1;
+; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0);
+; CHECK-SM100-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-SM100-NEXT: } // callseq 0
+; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2;
+; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3;
+; CHECK-SM100-NEXT: ret;
+ %v = call <2 x i32> @return_i32x2(i32 %0)
+ %v.f32x2 = bitcast <2 x i32> %v to <2 x float>
+ %res = fadd <2 x float> %v.f32x2, %v.f32x2
+ store <2 x float> %res, ptr %p, align 8
+ ret void
+}
+
+; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to
+; v2i32.
+define ptx_kernel void @inlineasm(ptr %p) {
+; CHECK-SM90A-LABEL: inlineasm(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r3, 0;
+; CHECK-SM90A-NEXT: mov.b32 %r4, %r3;
+; CHECK-SM90A-NEXT: mov.b32 %r2, %r4;
+; CHECK-SM90A-NEXT: mov.b32 %r1, %r3;
+; CHECK-SM90A-NEXT: // begin inline asm
+; CHECK-SM90A-NEXT: // nop
+; CHECK-SM90A-NEXT: // end inline asm
+; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000;
+; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000;
+; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5};
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: inlineasm(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<6>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r3, 0;
+; CHECK-SM100-NEXT: mov.b32 %r4, %r3;
+; CHECK-SM100-NEXT: mov.b32 %r2, %r4;
+; CHECK-SM100-NEXT: mov.b32 %r1, %r3;
+; CHECK-SM100-NEXT: // begin inline asm
+; CHECK-SM100-NEXT: // nop
+; CHECK-SM100-NEXT: // end inline asm
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2};
+; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000;
+; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5};
+; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4;
+; CHECK-SM100-NEXT: ret;
+ %r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer)
+ %i0 = extractvalue { <1 x float>, <1 x float> } %r, 0
+ %i1 = extractvalue { <1 x float>, <1 x float> } %r, 1
+ %i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> <i32 0, i32 1>
+ %mul = fmul <2 x float> %i4, zeroinitializer
+ store <2 x float> %mul, ptr %p, align 8
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 217bb483682ff..a90cfff51e2c6 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -1938,16 +1938,29 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
}
define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_uitofp_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
-; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2;
-; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_uitofp_2xi32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_uitofp_2xi32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
@@ -1969,16 +1982,29 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
}
define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_sitofp_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
-; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2;
-; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_sitofp_2xi32(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_sitofp_2xi32(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
@@ -2017,16 +2043,17 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
; CHECK-F32X2-LABEL: test_uitofp_2xi32_fadd(
; CHECK-F32X2: {
; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
-; CHECK-F32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<5>;
; CHECK-F32X2-EMPTY:
; CHECK-F32X2-NEXT: // %bb.0:
-; CHECK-F32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
-; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r4, %r3};
-; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r4, %r3};
+; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4;
; CHECK-F32X2-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
@@ -2114,14 +2141,23 @@ define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
}
define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-F32X2-NEXT: ret;
%r = bitcast <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
@@ -2851,31 +2887,57 @@ define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 {
}
define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0];
-; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2;
-; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_sitofp_2xi32_to_2xfloat(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_sitofp_2xi32_to_2xfloat(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_to_2xfloat_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0];
-; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2;
-; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
-; CHECK-NEXT: ret;
+; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_to_2xfloat(
+; CHECK-NOF32X2: {
+; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT: // %bb.0:
+; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0];
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOF32X2-NEXT: ret;
+;
+; CHECK-F32X2-LABEL: test_uitofp_2xi32_to_2xfloat(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_to_2xfloat_param_0];
+; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-F32X2-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
index d219493d2b31b..3fac29f74125b 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
@@ -346,15 +346,19 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
; SM100-LABEL: ld_global_v8i32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
-; SM100-NEXT: .reg .b64 %rd<2>;
+; SM100-NEXT: .reg .b64 %rd<6>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
-; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; SM100-NEXT: add.s32 %r9, %r1, %r2;
-; SM100-NEXT: add.s32 %r10, %r3, %r4;
-; SM100-NEXT: add.s32 %r11, %r5, %r6;
-; SM100-NEXT: add.s32 %r12, %r7, %r8;
+; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: mov.b64 {%r1, %r2}, %rd5;
+; SM100-NEXT: mov.b64 {%r3, %r4}, %rd4;
+; SM100-NEXT: mov.b64 {%r5, %r6}, %rd3;
+; SM100-NEXT: mov.b64 {%r7, %r8}, %rd2;
+; SM100-NEXT: add.s32 %r9, %r7, %r8;
+; SM100-NEXT: add.s32 %r10, %r5, %r6;
+; SM100-NEXT: add.s32 %r11, %r3, %r4;
+; SM100-NEXT: add.s32 %r12, %r1, %r2;
; SM100-NEXT: add.s32 %r13, %r9, %r10;
; SM100-NEXT: add.s32 %r14, %r11, %r12;
; SM100-NEXT: add.s32 %r15, %r13, %r14;
diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
index 12e3287e73f0f..57852451c0c72 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll
@@ -82,11 +82,11 @@ define void @avar_bfloat() {
define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
-; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
+; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0
store <8 x i32> %load, ptr addrspace(1) @globalout
@@ -202,11 +202,11 @@ define void @asi_bfloat() {
define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
-; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
+; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
@@ -331,14 +331,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
-; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
-; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
-; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1];
+; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0
store <8 x i32> %load, ptr addrspace(1) %out
@@ -472,14 +471,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
-; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
-; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
-; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
+; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
index b7fa1dd5f2c4d..21604dfbf0013 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
@@ -78,11 +78,11 @@ define void @avar_bfloat() {
define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
-; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
+; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) @globalin
store <8 x i32> %load, ptr addrspace(1) @globalout
@@ -198,11 +198,11 @@ define void @asi_bfloat() {
define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
-; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
+; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
@@ -327,14 +327,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
-; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
-; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
-; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1];
+; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) %in
store <8 x i32> %load, ptr addrspace(1) %out
@@ -468,14 +467,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<9>;
-; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
-; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
-; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
+; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
index e8b43ad28ad27..b5319935f0f9d 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
@@ -100,19 +100,32 @@ define void @generic_16xbfloat(ptr %a, ptr %b) {
}
define void @generic_8xi32(ptr %a, ptr %b) {
-; CHECK-LABEL: generic_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
-; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1];
-; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: generic_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
+; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1];
+; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: generic_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
+; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [generic_8xi32_param_1];
+; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr %a
store <8 x i32> %a.load, ptr %b
ret void
@@ -265,19 +278,32 @@ define void @generic_volatile_16xbfloat(ptr %a, ptr %b) {
}
define void @generic_volatile_8xi32(ptr %a, ptr %b) {
-; CHECK-LABEL: generic_volatile_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
-; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1];
-; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: generic_volatile_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
+; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1];
+; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: generic_volatile_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
+; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xi32_param_1];
+; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr %a
store volatile <8 x i32> %a.load, ptr %b
ret void
@@ -496,14 +522,13 @@ define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
;
; SM100-LABEL: global_8xi32(
; SM100: {
-; SM100-NEXT: .reg .b32 %r<9>;
-; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
-; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
-; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_param_1];
+; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(1) %a
store <8 x i32> %a.load, ptr addrspace(1) %b
@@ -741,14 +766,13 @@ define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
;
; SM100-LABEL: global_volatile_8xi32(
; SM100: {
-; SM100-NEXT: .reg .b32 %r<9>;
-; SM100-NEXT: .reg .b64 %rd<3>;
+; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0];
-; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xi32_param_1];
-; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
+; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xi32_param_1];
+; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(1) %a
store volatile <8 x i32> %a.load, ptr addrspace(1) %b
@@ -924,19 +948,32 @@ define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
}
define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
-; CHECK-LABEL: shared_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0];
-; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1];
-; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: shared_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0];
+; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1];
+; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: shared_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0];
+; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [shared_8xi32_param_1];
+; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(3) %a
store <8 x i32> %a.load, ptr addrspace(3) %b
ret void
@@ -1089,19 +1126,32 @@ define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b)
}
define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
-; CHECK-LABEL: shared_volatile_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
-; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1];
-; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: shared_volatile_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
+; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1];
+; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: shared_volatile_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
+; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xi32_param_1];
+; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(3) %a
store volatile <8 x i32> %a.load, ptr addrspace(3) %b
ret void
@@ -1256,19 +1306,32 @@ define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
}
define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
-; CHECK-LABEL: local_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_8xi32_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(5) %a
store <8 x i32> %a.load, ptr addrspace(5) %b
ret void
@@ -1421,19 +1484,32 @@ define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b)
}
define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
-; CHECK-LABEL: local_volatile_8xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
-; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT: ret;
+; SM90-LABEL: local_volatile_8xi32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<9>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
+; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1];
+; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; SM90-NEXT: ret;
+;
+; SM100-LABEL: local_volatile_8xi32(
+; SM100: {
+; SM100-NEXT: .reg .b64 %rd<7>;
+; SM100-EMPTY:
+; SM100-NEXT: // %bb.0:
+; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
+; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xi32_param_1];
+; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(5) %a
store volatile <8 x i32> %a.load, ptr addrspace(5) %b
ret void
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index f871e4039a558..87787ba2bf81c 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -1452,22 +1452,44 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_add_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_add_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
-; CHECK-NEXT: add.s32 %r9, %r4, %r8;
-; CHECK-NEXT: add.s32 %r10, %r2, %r6;
-; CHECK-NEXT: add.s32 %r11, %r10, %r9;
-; CHECK-NEXT: add.s32 %r12, %r3, %r7;
-; CHECK-NEXT: add.s32 %r13, %r1, %r5;
-; CHECK-NEXT: add.s32 %r14, %r13, %r12;
-; CHECK-NEXT: add.s32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_add_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
+; CHECK-SM80-NEXT: add.s32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: add.s32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: add.s32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: add.s32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: add.s32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: add.s32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_add_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_add_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_add_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: add.s32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: add.s32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: add.s32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: add.s32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: add.s32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: add.s32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.add(<8 x i32> %in)
ret i32 %res
}
@@ -1543,22 +1565,44 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_mul_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_mul_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
-; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8;
-; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6;
-; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9;
-; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7;
-; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5;
-; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12;
-; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_mul_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
+; CHECK-SM80-NEXT: mul.lo.s32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: mul.lo.s32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: mul.lo.s32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: mul.lo.s32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: mul.lo.s32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: mul.lo.s32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: mul.lo.s32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_mul_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_mul_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_mul_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: mul.lo.s32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: mul.lo.s32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: mul.lo.s32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: mul.lo.s32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: mul.lo.s32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: mul.lo.s32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: mul.lo.s32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in)
ret i32 %res
}
@@ -1673,22 +1717,44 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_umax_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_umax_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
-; CHECK-NEXT: max.u32 %r9, %r4, %r8;
-; CHECK-NEXT: max.u32 %r10, %r2, %r6;
-; CHECK-NEXT: max.u32 %r11, %r10, %r9;
-; CHECK-NEXT: max.u32 %r12, %r3, %r7;
-; CHECK-NEXT: max.u32 %r13, %r1, %r5;
-; CHECK-NEXT: max.u32 %r14, %r13, %r12;
-; CHECK-NEXT: max.u32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_umax_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
+; CHECK-SM80-NEXT: max.u32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: max.u32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: max.u32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.u32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: max.u32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: max.u32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.u32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_umax_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umax_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umax_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: max.u32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: max.u32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: max.u32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: max.u32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: max.u32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: max.u32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: max.u32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in)
ret i32 %res
}
@@ -1803,22 +1869,44 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_umin_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_umin_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
-; CHECK-NEXT: min.u32 %r9, %r4, %r8;
-; CHECK-NEXT: min.u32 %r10, %r2, %r6;
-; CHECK-NEXT: min.u32 %r11, %r10, %r9;
-; CHECK-NEXT: min.u32 %r12, %r3, %r7;
-; CHECK-NEXT: min.u32 %r13, %r1, %r5;
-; CHECK-NEXT: min.u32 %r14, %r13, %r12;
-; CHECK-NEXT: min.u32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_umin_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
+; CHECK-SM80-NEXT: min.u32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: min.u32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: min.u32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.u32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: min.u32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: min.u32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.u32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_umin_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umin_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umin_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: min.u32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: min.u32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: min.u32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: min.u32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: min.u32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: min.u32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: min.u32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in)
ret i32 %res
}
@@ -1933,22 +2021,44 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_smax_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_smax_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
-; CHECK-NEXT: max.s32 %r9, %r4, %r8;
-; CHECK-NEXT: max.s32 %r10, %r2, %r6;
-; CHECK-NEXT: max.s32 %r11, %r10, %r9;
-; CHECK-NEXT: max.s32 %r12, %r3, %r7;
-; CHECK-NEXT: max.s32 %r13, %r1, %r5;
-; CHECK-NEXT: max.s32 %r14, %r13, %r12;
-; CHECK-NEXT: max.s32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_smax_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
+; CHECK-SM80-NEXT: max.s32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: max.s32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: max.s32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.s32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: max.s32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: max.s32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.s32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_smax_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smax_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smax_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: max.s32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: max.s32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: max.s32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: max.s32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: max.s32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: max.s32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: max.s32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in)
ret i32 %res
}
@@ -2063,22 +2173,44 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_smin_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_smin_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
-; CHECK-NEXT: min.s32 %r9, %r4, %r8;
-; CHECK-NEXT: min.s32 %r10, %r2, %r6;
-; CHECK-NEXT: min.s32 %r11, %r10, %r9;
-; CHECK-NEXT: min.s32 %r12, %r3, %r7;
-; CHECK-NEXT: min.s32 %r13, %r1, %r5;
-; CHECK-NEXT: min.s32 %r14, %r13, %r12;
-; CHECK-NEXT: min.s32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_smin_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
+; CHECK-SM80-NEXT: min.s32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: min.s32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: min.s32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.s32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: min.s32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: min.s32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.s32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_smin_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smin_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smin_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: min.s32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: min.s32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: min.s32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: min.s32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: min.s32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: min.s32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: min.s32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in)
ret i32 %res
}
@@ -2152,22 +2284,44 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_and_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_and_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
-; CHECK-NEXT: and.b32 %r9, %r4, %r8;
-; CHECK-NEXT: and.b32 %r10, %r2, %r6;
-; CHECK-NEXT: and.b32 %r11, %r10, %r9;
-; CHECK-NEXT: and.b32 %r12, %r3, %r7;
-; CHECK-NEXT: and.b32 %r13, %r1, %r5;
-; CHECK-NEXT: and.b32 %r14, %r13, %r12;
-; CHECK-NEXT: and.b32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_and_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
+; CHECK-SM80-NEXT: and.b32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: and.b32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: and.b32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: and.b32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: and.b32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: and.b32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_and_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_and_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_and_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: and.b32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: and.b32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: and.b32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: and.b32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: and.b32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: and.b32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.and(<8 x i32> %in)
ret i32 %res
}
@@ -2241,22 +2395,44 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_or_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_or_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
-; CHECK-NEXT: or.b32 %r9, %r4, %r8;
-; CHECK-NEXT: or.b32 %r10, %r2, %r6;
-; CHECK-NEXT: or.b32 %r11, %r10, %r9;
-; CHECK-NEXT: or.b32 %r12, %r3, %r7;
-; CHECK-NEXT: or.b32 %r13, %r1, %r5;
-; CHECK-NEXT: or.b32 %r14, %r13, %r12;
-; CHECK-NEXT: or.b32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_or_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
+; CHECK-SM80-NEXT: or.b32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: or.b32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: or.b32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: or.b32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: or.b32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_or_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_or_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_or_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: or.b32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: or.b32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: or.b32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: or.b32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: or.b32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: or.b32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.or(<8 x i32> %in)
ret i32 %res
}
@@ -2330,22 +2506,44 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_xor_i32(<8 x i32> %in) {
-; CHECK-LABEL: reduce_xor_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
-; CHECK-NEXT: xor.b32 %r9, %r4, %r8;
-; CHECK-NEXT: xor.b32 %r10, %r2, %r6;
-; CHECK-NEXT: xor.b32 %r11, %r10, %r9;
-; CHECK-NEXT: xor.b32 %r12, %r3, %r7;
-; CHECK-NEXT: xor.b32 %r13, %r1, %r5;
-; CHECK-NEXT: xor.b32 %r14, %r13, %r12;
-; CHECK-NEXT: xor.b32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM80-LABEL: reduce_xor_i32(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
+; CHECK-SM80-NEXT: xor.b32 %r9, %r4, %r8;
+; CHECK-SM80-NEXT: xor.b32 %r10, %r2, %r6;
+; CHECK-SM80-NEXT: xor.b32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: xor.b32 %r12, %r3, %r7;
+; CHECK-SM80-NEXT: xor.b32 %r13, %r1, %r5;
+; CHECK-SM80-NEXT: xor.b32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: xor.b32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_xor_i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<16>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_xor_i32_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_xor_i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM100-NEXT: xor.b32 %r5, %r4, %r2;
+; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM100-NEXT: xor.b32 %r10, %r9, %r7;
+; CHECK-SM100-NEXT: xor.b32 %r11, %r10, %r5;
+; CHECK-SM100-NEXT: xor.b32 %r12, %r3, %r1;
+; CHECK-SM100-NEXT: xor.b32 %r13, %r8, %r6;
+; CHECK-SM100-NEXT: xor.b32 %r14, %r13, %r12;
+; CHECK-SM100-NEXT: xor.b32 %r15, %r14, %r11;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM100-NEXT: ret;
%res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in)
ret i32 %res
}
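
A note on the pattern above: the CHECK-SM80 and CHECK-SM100 bodies compute the identical reduction tree; only the parameter lowering differs. With v2i32 legal, the <8 x i32> argument on the newer target arrives as two 128-bit loads into packed 64-bit registers (ld.param.v2.b64), and each i32 pair is then unpacked with mov.b64 {%rX, %rY}, %rdZ before the scalar ops, whereas the older target loads four 32-bit lanes at a time (ld.param.v4.b32). A minimal standalone sketch of an IR reproducer follows; the file and function names are illustrative, and the exact llc invocation is an assumption since the RUN lines are not part of these hunks, but something like llc -march=nvptx64 -mcpu=sm_100 over this IR should show the packed form:

; sketch.ll -- illustrative only, not part of the committed test
define i32 @reduce_add_v8i32_sketch(<8 x i32> %in) {
  ; expected on sm_100: ld.param.v2.b64 loads, mov.b64 unpacks, then add.s32 tree
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %in)
  ret i32 %res
}
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)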