[llvm] [NVPTX] Fixup EXT_LOAD lowering for i128 values (PR #138049)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 30 16:14:25 PDT 2025
AlexMaclean (https://github.com/AlexMaclean) created https://github.com/llvm/llvm-project/pull/138049
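A short summary, as read from the patch: LowerSTOREVector and ReplaceLoadVector now return early when the value type does not match the memory type (i.e. extending loads and truncating stores), presumably so those cases fall back to the default handling rather than being lowered as plain vector accesses. NVPTXISelDAGToDAG.cpp additionally gains asserts that every selected load/store access width is a power of two between 8 and 128 bits, and a new test (llvm/test/CodeGen/NVPTX/i128-ld-st.ll) covers an i8 load zero-extended to i128 followed by an i128 store.

A minimal sketch of the width invariant those asserts encode (the helper name isValidPTXLdStWidth is hypothetical, not part of the patch):

  #include "llvm/Support/MathExtras.h"

  // Hypothetical helper restating the new asserts in tryLoad/tryLoadVector and
  // tryStore/tryStoreVector: a PTX ld/st access width must be a power of two
  // between 8 and 128 bits (the vector forms also cap the total width at 128).
  static bool isValidPTXLdStWidth(unsigned WidthInBits) {
    return llvm::isPowerOf2_32(WidthInBits) && WidthInBits >= 8 &&
           WidthInBits <= 128;
  }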
From 403c693781a82306d63ca97f9f37f4ede6859648 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 30 Apr 2025 23:10:00 +0000
Subject: [PATCH] [NVPTX] Fixup EXT_LOAD lowering for i128 values
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 13 ++++++++++
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 27 +++++++++++---------
llvm/test/CodeGen/NVPTX/i128-ld-st.ll | 28 +++++++++++++++++++++
3 files changed, 56 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/i128-ld-st.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..57f35a827d9cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
#include <optional>
using namespace llvm;
@@ -1141,6 +1142,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
else
FromType = getLdStRegType(ScalarVT);
+ assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+ FromTypeWidth <= 128 && "Invalid width for load");
+
// Create the machine instruction DAG
SDValue Offset, Base;
SelectADDR(N->getOperand(1), Base, Offset);
@@ -1236,6 +1240,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
FromType = NVPTX::PTXLdStInstCode::Untyped;
}
+ assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+ FromTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for load");
+
SDValue Offset, Base;
SelectADDR(N->getOperand(1), Base, Offset);
SDValue Ops[] = {getI32Imm(Ordering, DL),
@@ -1453,6 +1460,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
// Create the machine instruction DAG
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
+ assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 &&
+ ToTypeWidth <= 128 && "Invalid width for store");
+
SDValue Offset, Base;
SelectADDR(ST->getBasePtr(), Base, Offset);
@@ -1537,6 +1547,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToType = NVPTX::PTXLdStInstCode::Untyped;
}
+ assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 &&
+ ToTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for store");
+
SDValue Offset, Base;
SelectADDR(N2, Base, Offset);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c41741ed10232..2f24667cb3cde 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3191,20 +3191,22 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
- SDNode *N = Op.getNode();
+ MemSDNode *N = cast<MemSDNode>(Op.getNode());
SDValue Val = N->getOperand(1);
SDLoc DL(N);
- EVT ValVT = Val.getValueType();
+ const EVT ValVT = Val.getValueType();
+ const EVT MemVT = N->getMemoryVT();
+ if (ValVT != MemVT)
+ return SDValue();
const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
if (!NumEltsAndEltVT)
return SDValue();
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
- MemSDNode *MemSD = cast<MemSDNode>(N);
const DataLayout &TD = DAG.getDataLayout();
- Align Alignment = MemSD->getAlign();
+ Align Alignment = N->getAlign();
Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
if (Alignment < PrefAlign) {
// This store is not sufficiently aligned, so bail out and let this vector
@@ -3267,7 +3269,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDValue NewSt =
DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
+ N->getMemoryVT(), N->getMemOperand());
// return DCI.CombineTo(N, NewSt, true);
return NewSt;
@@ -5762,20 +5764,20 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
- const EVT ResVT = N->getValueType(0);
- SDLoc DL(N);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ const EVT ResVT = LD->getValueType(0);
+ const EVT MemVT = LD->getMemoryVT();
+ if (ResVT != MemVT)
+ return;
const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
if (!NumEltsAndEltVT)
return;
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
Align Alignment = LD->getAlign();
const auto &TD = DAG.getDataLayout();
- Align PrefAlign =
- TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
if (Alignment < PrefAlign) {
// This load is not sufficiently aligned, so bail out and let this vector
// load be scalarized. Note that we may still be able to emit smaller
@@ -5806,9 +5808,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
break;
}
}
+ SDLoc DL(LD);
// Copy regular operands
- SmallVector<SDValue, 8> OtherOps(N->ops());
+ SmallVector<SDValue, 8> OtherOps(LD->ops());
// The select routine does not have access to the LoadSDNode instance, so
// pass along the extension information
diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
new file mode 100644
index 0000000000000..f3d04f133a8d4
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O0 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i128 @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: bra.uni $L__BB0_1;
+; CHECK-NEXT: $L__BB0_1: // %while.cond
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mov.b64 %rd1, 0;
+; CHECK-NEXT: ld.u8 %rd2, [%rd1];
+; CHECK-NEXT: st.v2.u64 [%rd1], {%rd2, %rd1};
+; CHECK-NEXT: bra.uni $L__BB0_1;
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %0 = load i8, ptr null, align 1
+ %conv = zext i8 %0 to i128
+ store i128 %conv, ptr null, align 16
+ br label %while.cond
+}