[llvm] [NVPTX] Switch to imm offset variants for LDG and LDU (PR #128270)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 18:05:44 PST 2025
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/128270
(No pull request description was provided.)
>From 391e45194b2e945799108fea9d155cf3d55e26c8 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 21 Feb 2025 18:45:49 +0000
Subject: [PATCH] [NVPTX] Switch to imm offset variants for LDG and LDU
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 86 +++++-----
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 169 ++++++++-----------
llvm/test/CodeGen/NVPTX/variadics-backend.ll | 44 +++--
3 files changed, 125 insertions(+), 174 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6f0bf510ad893..971a128aadfdb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
}
+ if (!Opcode)
+ return false;
+ Ops.append({Base, Offset, Chain});
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
std::optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
- SDValue Base, Offset, Addr;
+ SDValue Base, Offset;
- if (SelectDirectAddr(Op1, Addr)) {
+ if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case ISD::LOAD:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
break;
case ISD::INTRINSIC_W_CHAIN:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
break;
case NVPTXISD::LDUV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
break;
case NVPTXISD::LDUV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- SDValue Ops[] = {Base, Offset, Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
}
+ if (!Opcode)
+ return false;
+ SDValue Ops[] = {Base, Offset, Chain};
+ LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
// For automatic generation of LDG (through SelectLoad[Vector], not the
// intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
break;
}
- Ops.append({Base, Offset});
} else {
if (PointerSize == 64) {
SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
- Ops.append({Base, Offset});
}
if (!Opcode)
return false;
-
- Ops.push_back(Chain);
+ Ops.append({Base, Offset, Chain});
ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index acb9fc9867b0f..eca2397ff3f26 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2693,23 +2693,23 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
// Scalar
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
}
-defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
// vector
@@ -2717,56 +2717,40 @@ defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
-}
-
-defm INT_PTX_LDU_G_v2i8_ELE
- : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i16_ELE
- : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i32_ELE
- : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDU_G_v2f32_ELE
- : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDU_G_v2i64_ELE
- : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDU_G_v2f64_ELE
- : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDU_G_v4i8_ELE
- : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v4i16_ELE
- : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4i32_ELE
- : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f16_ELE
- : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4f16x2_ELE
- : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f32_ELE
- : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Float32Regs>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
//-----------------------------------
@@ -2778,29 +2762,23 @@ defm INT_PTX_LDU_G_v4f32_ELE
// during the lifetime of the kernel.
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
}
-defm INT_PTX_LDG_GLOBAL_i8
- : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i16
- : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDG_GLOBAL_i64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDG_GLOBAL_f32
- : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDG_GLOBAL_f64
- : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
// vector
@@ -2808,54 +2786,39 @@ defm INT_PTX_LDG_GLOBAL_f64
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
- def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int32Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int64Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
}
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-defm INT_PTX_LDG_G_v2i8_ELE
- : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i16_ELE
- : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i32_ELE
- : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v2f32_ELE
- : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDG_G_v2i64_ELE
- : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDG_G_v2f64_ELE
- : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDG_G_v4i8_ELE
- : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i16_ELE
- : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i32_ELE
- : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v4f32_ELE
- : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
multiclass NG_TO_G<string Str> {
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 4d4db21c6ed0d..377528b94f505 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -214,34 +214,33 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: mov.u64 %rd3, __const_$_bar_$_s1;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd3+7];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
; CHECK-PTX-NEXT: st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3+6];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
; CHECK-PTX-NEXT: st.local.u8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd3+5];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
; CHECK-PTX-NEXT: st.local.u8 [%rd2], %rs6;
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1;
; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs7;
-; CHECK-PTX-NEXT: mov.b64 %rd4, 1;
-; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd4;
-; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8;
+; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
+; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd3;
+; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics2,
@@ -380,28 +379,27 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<10>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: mov.u64 %rd3, __const_$_qux_$_s;
-; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3+8];
-; CHECK-PTX-NEXT: st.local.u64 [%rd2+8], %rd4;
-; CHECK-PTX-NEXT: ld.global.nc.u64 %rd5, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT: st.local.u64 [%rd2], %rd5;
-; CHECK-PTX-NEXT: mov.b64 %rd6, 1;
-; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd6;
-; CHECK-PTX-NEXT: ld.local.u64 %rd7, [%rd2];
-; CHECK-PTX-NEXT: ld.local.u64 %rd8, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd9, %SP, 16;
+; CHECK-PTX-NEXT: ld.global.nc.u64 %rd3, [__const_$_qux_$_s+8];
+; CHECK-PTX-NEXT: st.local.u64 [%rd2+8], %rd3;
+; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT: st.local.u64 [%rd2], %rd4;
+; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT: ld.local.u64 %rd6, [%rd2];
+; CHECK-PTX-NEXT: ld.local.u64 %rd7, [%rd2+8];
+; CHECK-PTX-NEXT: add.u64 %rd8, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd8;
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd7;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd9;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd8;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics4,
More information about the llvm-commits mailing list