[llvm] 00139f1 - [NVPTX] Cleanup ld/st lowering (#143936)
Author: Alex MacLean
Date: 2025-06-17T09:00:18-07:00
New Revision: 00139f10c3cd4118de7148635c820bb42843287a
URL: https://github.com/llvm/llvm-project/commit/00139f10c3cd4118de7148635c820bb42843287a
DIFF: https://github.com/llvm/llvm-project/commit/00139f10c3cd4118de7148635c820bb42843287a.diff
LOG: [NVPTX] Cleanup ld/st lowering (#143936)
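In short, the diff below splits the old tryLDGLDU selection path into a
dedicated tryLDG routine for non-coherent global loads and a smaller tryLDU
routine for the ldu intrinsics/nodes. The LDG (ld.global.nc) instruction
definitions gain explicit sign and width operands, so extending loads no
longer need the manual CVT emulation, and the GetConvertOpcode helper is
deleted. A minimal sketch of the user-visible effect, modeled on the
bug26185.ll tests updated below (function and value names here are
illustrative; the PTX in the comments mirrors the new CHECK lines and is
otherwise an assumption):

target triple = "nvptx64-nvidia-cuda"

define ptx_kernel void @sext_i8_example(ptr noalias readonly %data, ptr %res) {
  ; Previously selected as ld.global.nc.b8 followed by cvt.s32.s8.
  ; Now the sign extension folds into the load: ld.global.nc.s8
  %val = load i8, ptr %data
  %ext = sext i8 %val to i32
  store i32 %ext, ptr %res
  ret void
}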
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/bug26185-2.ll
llvm/test/CodeGen/NVPTX/bug26185.ll
llvm/test/CodeGen/NVPTX/i1-ext-load.ll
llvm/test/CodeGen/NVPTX/ldu-ldg.ll
llvm/test/CodeGen/NVPTX/variadics-backend.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 79b1bfbc8072b..ff10eea371049 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -136,7 +136,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
break;
case NVPTXISD::LDUV2:
case NVPTXISD::LDUV4:
- if (tryLDGLDU(N))
+ if (tryLDU(N))
return;
break;
case NVPTXISD::StoreV2:
@@ -324,7 +324,7 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_p:
- return tryLDGLDU(N);
+ return tryLDU(N);
case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
@@ -1048,35 +1048,28 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
assert(LD->readMem() && "Expected load");
// do not support pre/post inc/dec
- LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
+ const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
if (PlainLoad && PlainLoad->isIndexed())
return false;
- EVT LoadedVT = LD->getMemoryVT();
- if (!LoadedVT.isSimple())
+ const EVT LoadedEVT = LD->getMemoryVT();
+ if (!LoadedEVT.isSimple())
return false;
+ const MVT LoadedVT = LoadedEVT.getSimpleVT();
// Address Space Setting
const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
- return tryLDGLDU(N);
+ return tryLDG(LD);
- SDLoc DL(N);
+ SDLoc DL(LD);
SDValue Chain = N->getOperand(0);
- auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
+ const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
- // Type Setting: fromType + fromTypeWidth
- //
- // Sign : ISD::SEXTLOAD
- // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
- // type is integer
- // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
- MVT SimpleVT = LoadedVT.getSimpleVT();
- // Read at least 8 bits (predicates are stored as 8-bit values)
- unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
+ const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
// Vector Setting
- unsigned int FromType =
+ const unsigned FromType =
(PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
? NVPTX::PTXLdStInstCode::Signed
: NVPTX::PTXLdStInstCode::Untyped;
@@ -1102,29 +1095,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (!Opcode)
return false;
- SDNode *NVPTXLD =
- CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
+ SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
if (!NVPTXLD)
return false;
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+ MachineMemOperand *MemRef = LD->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
- ReplaceNode(N, NVPTXLD);
+ ReplaceNode(LD, NVPTXLD);
return true;
}
-static bool isSubVectorPackedInI32(EVT EltVT) {
- // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
- // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
- // vectorized loads/stores with the actual element type for i8/i16 as that
- // would require v8/v16 variants that do not exist.
- // In order to load/store such vectors efficiently, in Type Legalization
- // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
- // lower to PTX as vectors of b32.
- return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
-}
-
static unsigned getLoadStoreVectorNumElts(SDNode *N) {
switch (N->getOpcode()) {
case NVPTXISD::LoadV2:
@@ -1142,21 +1123,21 @@ static unsigned getLoadStoreVectorNumElts(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
- MemSDNode *MemSD = cast<MemSDNode>(N);
- const EVT MemEVT = MemSD->getMemoryVT();
+ MemSDNode *LD = cast<MemSDNode>(N);
+ const EVT MemEVT = LD->getMemoryVT();
if (!MemEVT.isSimple())
return false;
const MVT MemVT = MemEVT.getSimpleVT();
// Address Space Setting
- const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
- return tryLDGLDU(N);
+ const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+ return tryLDG(LD);
- EVT EltVT = N->getValueType(0);
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+ const MVT EltVT = LD->getSimpleValueType(0);
+ SDLoc DL(LD);
+ SDValue Chain = LD->getChain();
+ const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
// Type Setting: fromType + fromTypeWidth
//
@@ -1167,18 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Read at least 8 bits (predicates are stored as 8-bit values)
// The last operand holds the original LoadSDNode::getExtensionType() value
const unsigned TotalWidth = MemVT.getSizeInBits();
- unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
- unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
- ? NVPTX::PTXLdStInstCode::Signed
- : NVPTX::PTXLdStInstCode::Untyped;
+ const unsigned ExtensionType =
+ N->getConstantOperandVal(N->getNumOperands() - 1);
+ const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+ ? NVPTX::PTXLdStInstCode::Signed
+ : NVPTX::PTXLdStInstCode::Untyped;
- unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
-
- if (isSubVectorPackedInI32(EltVT)) {
- assert(ExtensionType == ISD::NON_EXTLOAD);
- EltVT = MVT::i32;
- }
+ const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
+ assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
@@ -1196,192 +1174,183 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::optional<unsigned> Opcode;
switch (N->getOpcode()) {
default:
- return false;
+ llvm_unreachable("Unexpected opcode");
case NVPTXISD::LoadV2:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
- NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2,
- NVPTX::LDV_i64_v2);
+ Opcode =
+ pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2,
+ NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
break;
case NVPTXISD::LoadV4:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
- NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4,
- NVPTX::LDV_i64_v4);
+ Opcode =
+ pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4,
+ NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
break;
case NVPTXISD::LoadV8:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
- {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */});
+ Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */},
+ NVPTX::LDV_i32_v8, {/* no v8i64 */});
break;
}
if (!Opcode)
return false;
- SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+ SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
+ MachineMemOperand *MemRef = LD->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
- ReplaceNode(N, LD);
+ ReplaceNode(LD, NVPTXLD);
return true;
}
-bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
- auto *Mem = cast<MemSDNode>(N);
-
- // If this is an LDG intrinsic, the address is the third operand. If its an
- // LDG/LDU SD node (from custom vector handling), then its the second operand
- SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
+ const EVT LoadedEVT = LD->getMemoryVT();
+ if (!LoadedEVT.isSimple())
+ return false;
+ const MVT LoadedVT = LoadedEVT.getSimpleVT();
- const EVT OrigType = N->getValueType(0);
- EVT EltVT = Mem->getMemoryVT();
- unsigned NumElts = 1;
+ SDLoc DL(LD);
- if (EltVT == MVT::i128 || EltVT == MVT::f128) {
- EltVT = MVT::i64;
- NumElts = 2;
- }
- if (EltVT.isVector()) {
- NumElts = EltVT.getVectorNumElements();
- EltVT = EltVT.getVectorElementType();
- // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
- // elements.
- if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
- (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
- (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
- (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
- assert(NumElts % OrigType.getVectorNumElements() == 0 &&
- "NumElts must be divisible by the number of elts in subvectors");
- EltVT = OrigType;
- NumElts /= OrigType.getVectorNumElements();
- }
+ const unsigned TotalWidth = LoadedVT.getSizeInBits();
+ unsigned ExtensionType;
+ unsigned NumElts;
+ if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
+ ExtensionType = Load->getExtensionType();
+ NumElts = 1;
+ } else {
+ ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
+ NumElts = getLoadStoreVectorNumElts(LD);
}
+ const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+ ? NVPTX::PTXLdStInstCode::Signed
+ : NVPTX::PTXLdStInstCode::Untyped;
- // Build the "promoted" result VTList for the load. If we are really loading
- // i8s, then the return type will be promoted to i16 since we do not expose
- // 8-bit registers in NVPTX.
- const EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
- SmallVector<EVT, 5> InstVTs;
- InstVTs.append(NumElts, NodeVT);
- InstVTs.push_back(MVT::Other);
- SDVTList InstVTList = CurDAG->getVTList(InstVTs);
- SDValue Chain = N->getOperand(0);
+ const unsigned FromTypeWidth = TotalWidth / NumElts;
+
+ assert(!(LD->getSimpleValueType(0).isVector() &&
+ ExtensionType != ISD::NON_EXTLOAD));
+ assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+ FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
SDValue Base, Offset;
- SelectADDR(Op1, Base, Offset);
- SDValue Ops[] = {Base, Offset, Chain};
+ SelectADDR(LD->getOperand(1), Base, Offset);
+ SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
+ Offset, LD->getChain()};
+ const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
std::optional<unsigned> Opcode;
- switch (N->getOpcode()) {
+ switch (LD->getOpcode()) {
default:
- return false;
+ llvm_unreachable("Unexpected opcode");
case ISD::LOAD:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
- NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
- NVPTX::INT_PTX_LDG_GLOBAL_i64);
- break;
- case ISD::INTRINSIC_W_CHAIN:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
- NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
- NVPTX::INT_PTX_LDU_GLOBAL_i64);
+ Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8,
+ NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32,
+ NVPTX::LD_GLOBAL_NC_i64);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
- NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE);
- break;
- case NVPTXISD::LDUV2:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
- NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE);
+ TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16,
+ NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
- NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
- NVPTX::INT_PTX_LDG_G_v4i64_ELE);
- break;
- case NVPTXISD::LDUV4:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDU_G_v4i8_ELE,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE,
- NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */});
+ TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16,
+ NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
break;
case NVPTXISD::LoadV8:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
- {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
- {/* no v8i64 */});
+ Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */},
+ NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
break;
}
if (!Opcode)
return false;
- SDLoc DL(N);
- SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
+ SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
- // For automatic generation of LDG (through SelectLoad[Vector], not the
- // intrinsics), we may have an extending load like:
- //
- // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
- //
- // In this case, the matching logic above will select a load for the original
- // memory type (in this case, i8) and our types will not match (the node needs
- // to return an i32 in this case). Our LDG/LDU nodes do not support the
- // concept of sign-/zero-extension, so emulate it here by adding an explicit
- // CVT instruction. Ptxas should clean up any redundancies here.
-
- LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
-
- if (OrigType != EltVT &&
- (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
- // We have an extending-load. The instruction we selected operates on the
- // smaller type, but the SDNode we are replacing has the larger type. We
- // need to emit a CVT to make the types match.
- unsigned CvtOpc =
- GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
-
- // For each output value, apply the manual sign/zero-extension and make sure
- // all users of the load go through that CVT.
- for (unsigned i = 0; i != NumElts; ++i) {
- SDValue Res(LD, i);
- SDValue OrigVal(N, i);
-
- SDNode *CvtNode =
- CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
- CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
- DL, MVT::i32));
- ReplaceUses(OrigVal, SDValue(CvtNode, 0));
- }
+ ReplaceNode(LD, NVPTXLDG);
+ return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
+ auto *LD = cast<MemSDNode>(N);
+
+ unsigned NumElts;
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case ISD::INTRINSIC_W_CHAIN:
+ NumElts = 1;
+ break;
+ case NVPTXISD::LDUV2:
+ NumElts = 2;
+ break;
+ case NVPTXISD::LDUV4:
+ NumElts = 4;
+ break;
}
- ReplaceNode(N, LD);
+ const MVT::SimpleValueType SelectVT =
+ MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
+
+ // If this is an LDU intrinsic, the address is the third operand. If it's an
+ // LDU SD node (from custom vector handling), then it's the second operand.
+ SDValue Addr =
+ LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+
+ SDValue Base, Offset;
+ SelectADDR(Addr, Base, Offset);
+ SDValue Ops[] = {Base, Offset, LD->getChain()};
+
+ std::optional<unsigned> Opcode;
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case ISD::INTRINSIC_W_CHAIN:
+ Opcode =
+ pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16,
+ NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
+ break;
+ case NVPTXISD::LDUV2:
+ Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8,
+ NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32,
+ NVPTX::LDU_GLOBAL_v2i64);
+ break;
+ case NVPTXISD::LDUV4:
+ Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8,
+ NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32,
+ {/* no v4i64 */});
+ break;
+ }
+ if (!Opcode)
+ return false;
+
+ SDLoc DL(N);
+ SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
+
+ ReplaceNode(LD, NVPTXLDU);
return true;
}
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MemSDNode *ST = cast<MemSDNode>(N);
assert(ST->writeMem() && "Expected store");
- StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
- AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+ StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
+ AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
assert((PlainStore || AtomicStore) && "Expected store");
// do not support pre/post inc/dec
if (PlainStore && PlainStore->isIndexed())
return false;
- EVT StoreVT = ST->getMemoryVT();
+ const EVT StoreVT = ST->getMemoryVT();
if (!StoreVT.isSimple())
return false;
// Address Space Setting
- unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+ const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
- SDLoc DL(N);
+ SDLoc DL(ST);
SDValue Chain = ST->getChain();
- auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
+ const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
@@ -1417,85 +1386,78 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!NVPTXST)
return false;
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+ MachineMemOperand *MemRef = ST->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
- ReplaceNode(N, NVPTXST);
+ ReplaceNode(ST, NVPTXST);
return true;
}
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
- SDValue Op1 = N->getOperand(1);
- EVT EltVT = Op1.getValueType();
- MemSDNode *MemSD = cast<MemSDNode>(N);
- EVT StoreVT = MemSD->getMemoryVT();
+ MemSDNode *ST = cast<MemSDNode>(N);
+ const EVT StoreVT = ST->getMemoryVT();
assert(StoreVT.isSimple() && "Store value is not simple");
// Address Space Setting
- unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+ const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
}
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+ SDLoc DL(ST);
+ SDValue Chain = ST->getChain();
+ const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
- unsigned NumElts = getLoadStoreVectorNumElts(N);
-
- SmallVector<SDValue, 16> Ops(N->ops().slice(1, NumElts));
- SDValue N2 = N->getOperand(NumElts + 1);
- unsigned ToTypeWidth = TotalWidth / NumElts;
+ const unsigned NumElts = getLoadStoreVectorNumElts(ST);
- if (isSubVectorPackedInI32(EltVT)) {
- EltVT = MVT::i32;
- }
+ SmallVector<SDValue, 16> Ops(ST->ops().slice(1, NumElts));
+ SDValue Addr = N->getOperand(NumElts + 1);
+ const unsigned ToTypeWidth = TotalWidth / NumElts;
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
TotalWidth <= 256 && "Invalid width for store");
SDValue Offset, Base;
- SelectADDR(N2, Base, Offset);
+ SelectADDR(Addr, Base, Offset);
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
+ const MVT::SimpleValueType EltVT =
+ ST->getOperand(1).getSimpleValueType().SimpleTy;
std::optional<unsigned> Opcode;
- switch (N->getOpcode()) {
+ switch (ST->getOpcode()) {
default:
return false;
case NVPTXISD::StoreV2:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
- NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
- NVPTX::STV_i64_v2);
+ Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2,
+ NVPTX::STV_i32_v2, NVPTX::STV_i64_v2);
break;
case NVPTXISD::StoreV4:
- Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
- NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
- NVPTX::STV_i64_v4);
+ Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4,
+ NVPTX::STV_i32_v4, NVPTX::STV_i64_v4);
break;
case NVPTXISD::StoreV8:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
- {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */});
+ Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */},
+ NVPTX::STV_i32_v8, {/* no v8i64 */});
break;
}
if (!Opcode)
return false;
- SDNode *ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+ SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
+ MachineMemOperand *MemRef = ST->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
- ReplaceNode(N, ST);
+ ReplaceNode(ST, NVPTXST);
return true;
}
@@ -2285,70 +2247,6 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
ReplaceNode(N, Mov);
}
-/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
-/// conversion from \p SrcTy to \p DestTy.
-unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
- LoadSDNode *LdNode) {
- bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
- switch (SrcTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled source type");
- case MVT::i8:
- switch (DestTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled dest type");
- case MVT::i16:
- return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
- case MVT::i32:
- return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
- case MVT::i64:
- return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
- }
- case MVT::i16:
- switch (DestTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled dest type");
- case MVT::i8:
- return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
- case MVT::i32:
- return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
- case MVT::i64:
- return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
- }
- case MVT::i32:
- switch (DestTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled dest type");
- case MVT::i8:
- return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
- case MVT::i16:
- return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
- case MVT::i64:
- return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
- }
- case MVT::i64:
- switch (DestTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled dest type");
- case MVT::i8:
- return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
- case MVT::i16:
- return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
- case MVT::i32:
- return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
- }
- case MVT::f16:
- switch (DestTy.SimpleTy) {
- default:
- llvm_unreachable("Unhandled dest type");
- case MVT::f32:
- return NVPTX::CVT_f32_f16;
- case MVT::f64:
- return NVPTX::CVT_f64_f16;
- }
- }
-}
-
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
SDLoc DL(N);
assert(N->getOpcode() == ISD::ATOMIC_FENCE);
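Note how tryLoadVector and tryLDG above now derive the PTX operand width
from the memory VT and the vector arity (TotalWidth / NumElts) and pass it
to the instruction as an explicit operand, rather than special-casing
packed sub-word vectors via isSubVectorPackedInI32. A small sketch of a
vector case, modeled on ex_zext_v2 in the bug26185.ll update below (names
are illustrative; the PTX comment repeats its new CHECK line and is
otherwise an assumption):

define ptx_kernel void @zext_v2i8_example(ptr noalias readonly %data, ptr %res) {
  ; Selected as a single elementized LDG: ld.global.nc.v2.b8 {%rs1, %rs2}, [...]
  %val = load <2 x i8>, ptr %data
  %ext = zext <2 x i8> %val to <2 x i32>
  store <2 x i32> %ext, ptr %res
  ret void
}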
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 473f4781a6c38..ff58e4486a222 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectTexSurfHandle(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
- bool tryLDGLDU(SDNode *N);
+ bool tryLDU(SDNode *N);
+ bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
bool tryLoadParam(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4c3501df57f84..5dbdce52f0553 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasClusters : Predicate<"Subtarget->hasClusters()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b3c1296cf0ca6..5de3dee1fb344 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
class LDU_G<string TyStr, NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
- "ldu.global." # TyStr # " \t$result, [$src];",
- []>, Requires<[hasLDU]>;
+ "ldu.global." # TyStr # " \t$result, [$src];", []>;
-def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
+def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
+def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
// vector
@@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
+def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>;
+def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
+def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>;
+def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
//-----------------------------------
@@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
// non-coherent texture cache, and therefore the values read must be read-only
// during the lifetime of the kernel.
-class LDG_G<string TyStr, NVPTXRegClass regclass>
- : NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
- "ld.global.nc." # TyStr # " \t$result, [$src];",
- []>, Requires<[hasLDG]>;
+class LDG_G<NVPTXRegClass regclass>
+ : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
+def LD_GLOBAL_NC_i8 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
+def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
// vector
// Elementized vector ldg
-class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins ADDR:$src),
- "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
-class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins ADDR:$src),
- "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
-class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
- (ins ADDR:$src),
- "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
+ (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
-
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
-
-def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
-def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
+def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
+def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
+
+def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
+
+def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
+def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
if Supports32 then
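With the ${Sign:sign}$fromWidth asm string, a single LDG_G def per register
class now prints any of b8/s8/b16/s16/..., which is what allows the
per-type and float variants above to be folded away. A sketch of the signed
case, modeled on the bug26185-2.ll update below (names are illustrative;
the PTX comment mirrors its new CHECK line):

define ptx_kernel void @sext_i16_example(ptr addrspace(1) noalias readonly %p,
                                         ptr addrspace(1) %out) {
  ; The sign modifier comes from the new instruction operand: ld.global.nc.s16
  %v = load i16, ptr addrspace(1) %p
  %e = sext i16 %v to i32
  store i32 %e, ptr addrspace(1) %out
  ret void
}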
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index c4d1537557cad..4e11f58f85ee0 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
@@ -10,14 +11,29 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
-; CHECK-LABEL: spam
define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+; CHECK-LABEL: spam(
+; CHECK: .maxntid 1, 1, 1
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %bb
+; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [spam_param_3];
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
+; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
+; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
+; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
+; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
+; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
+; CHECK-NEXT: st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT: ret;
bb:
%tmp5 = add nsw i64 %arg3, 8
%tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
-; CHECK: ld.global.nc.b16
%tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
-; CHECK: cvt.s32.s16
%tmp8 = sext i16 %tmp7 to i64
%tmp9 = mul nsw i64 %tmp8, %tmp8
%tmp10 = load i64, ptr addrspace(1) %arg1, align 8
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 3b30ce560edbc..6148c0756e393 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
@@ -7,45 +8,93 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"
-; CHECK-LABEL: ex_zext
define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2];
+; CHECK-NEXT: st.global.b32 [%rd4], %r1;
+; CHECK-NEXT: ret;
entry:
-; CHECK: ld.global.nc.b8
%val = load i8, ptr %data
-; CHECK: cvt.u32.u8
%valext = zext i8 %val to i32
store i32 %valext, ptr %res
ret void
}
-; CHECK-LABEL: ex_sext
define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2];
+; CHECK-NEXT: st.global.b32 [%rd4], %r1;
+; CHECK-NEXT: ret;
entry:
-; CHECK: ld.global.nc.b8
%val = load i8, ptr %data
-; CHECK: cvt.s32.s8
%valext = sext i8 %val to i32
store i32 %valext, ptr %res
ret void
}
-; CHECK-LABEL: ex_zext_v2
define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext_v2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_v2_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_v2_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r2, %r1};
+; CHECK-NEXT: ret;
entry:
-; CHECK: ld.global.nc.v2.b8
%val = load <2 x i8>, ptr %data
-; CHECK: cvt.u32.u16
%valext = zext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, ptr %res
ret void
}
-; CHECK-LABEL: ex_sext_v2
define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext_v2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_v2_param_0];
+; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_v2_param_1];
+; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: cvt.s32.s8 %r4, %r3;
+; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r4, %r2};
+; CHECK-NEXT: ret;
entry:
-; CHECK: ld.global.nc.v2.b8
%val = load <2 x i8>, ptr %data
-; CHECK: cvt.s32.s8
%valext = sext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, ptr %res
ret void
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index bb88d1f2755ca..3dceefb93a47d 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -7,7 +7,6 @@ target triple = "nvptx-nvidia-cuda"
define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
; CHECK-LABEL: foo(
-; CHECK: .reg .b16 %rs<2>;
; CHECK: .reg .b32 %r<4>;
; CHECK: .reg .b64 %rd<5>;
; CHECK-EMPTY:
@@ -15,8 +14,7 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
; CHECK: cvta.to.global.u64 %rd2, %rd1;
; CHECK: ld.param.b64 %rd3, [foo_param_1];
; CHECK: cvta.to.global.u64 %rd4, %rd3;
-; CHECK: ld.global.nc.b8 %rs1, [%rd2];
-; CHECK: cvt.u32.u8 %r1, %rs1;
+; CHECK: ld.global.nc.b8 %r1, [%rd2];
; CHECK: add.s32 %r2, %r1, 1;
; CHECK: and.b32 %r3, %r2, 1;
; CHECK: st.global.b32 [%rd4], %r3;
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 7ac697c4ce203..7f4b049af84fb 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldg_i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0];
-; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1];
-; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldg_i16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0];
-; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
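tryLDU, by contrast, keeps the old address-only operand form: the element
size is recovered from the memory VT (SelectVT above), so the LDU
instructions carry no sign/width operands. A sketch of the scalar intrinsic
path, following the call pattern of test_ldg_i8 above (the PTX comment is
an assumption consistent with the LDU_GLOBAL_i32 def earlier in the diff):

declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1), i32)

define i32 @ldu_i32_example(ptr addrspace(1) %ptr) {
  ; Selected through tryLDU: ldu.global.b32 %r1, [%rd1];
  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
  ret i32 %val
}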
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 3bbdf641ade26..ddaa9fd831af7 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
+; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6;
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
-; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
+; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;