[llvm] 16e712e - [NVPTX] Allow directly storing immediates to improve readability (#145552)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 18:46:42 PDT 2025
Author: Alex MacLean
Date: 2025-06-25T18:46:39-07:00
New Revision: 16e712e7c308569a8420d322a77bc696bfd3fcb6
URL: https://github.com/llvm/llvm-project/commit/16e712e7c308569a8420d322a77bc696bfd3fcb6
DIFF: https://github.com/llvm/llvm-project/commit/16e712e7c308569a8420d322a77bc696bfd3fcb6.diff
LOG: [NVPTX] Allow directly storing immediates to improve readability (#145552)
Allow directly storing an immediate instead of requiring that it first
be moved into a register. This makes for more compact and readable PTX.
A similar approach (using a ComplexPattern) could be used for most PTX
instructions to avoid the need for `_[ri]+` variants and boilerplate.
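For example, storing the constant 1 to a shared variable previously went
through a scratch register; with this change the immediate is emitted
directly (illustrative PTX taken from the updated access-non-generic.ll
test below):

    // before
    mov.b32        %r1, 1;
    st.shared.b32  [array+4], %r1;

    // after
    st.shared.b32  [array+4], 1;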
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/test/CodeGen/NVPTX/access-non-generic.ll
llvm/test/CodeGen/NVPTX/bf16-instructions.ll
llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
llvm/test/CodeGen/NVPTX/chain-different-as.ll
llvm/test/CodeGen/NVPTX/demote-vars.ll
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
llvm/test/CodeGen/NVPTX/i1-load-lower.ll
llvm/test/CodeGen/NVPTX/i128-ld-st.ll
llvm/test/CodeGen/NVPTX/jump-table.ll
llvm/test/CodeGen/NVPTX/local-stack-frame.ll
llvm/test/CodeGen/NVPTX/lower-alloca.ll
llvm/test/CodeGen/NVPTX/lower-byval-args.ll
llvm/test/CodeGen/NVPTX/param-align.ll
llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
llvm/test/CodeGen/NVPTX/reg-types.ll
llvm/test/CodeGen/NVPTX/variadics-backend.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 61fe8a53cb63a..5ee1bee49247c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1339,20 +1339,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDValue Offset, Base;
SelectADDR(ST->getBasePtr(), Base, Offset);
- SDValue Ops[] = {Value,
+ SDValue Ops[] = {selectPossiblyImm(Value),
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
- getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
getI32Imm(ToTypeWidth, DL),
Base,
Offset,
Chain};
- const MVT::SimpleValueType SourceVT =
- Value.getNode()->getSimpleValueType(0).SimpleTy;
- const std::optional<unsigned> Opcode = pickOpcodeForVT(
- SourceVT, NVPTX::ST_i8, NVPTX::ST_i16, NVPTX::ST_i32, NVPTX::ST_i64);
+ const std::optional<unsigned> Opcode =
+ pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i8,
+ NVPTX::ST_i16, NVPTX::ST_i32, NVPTX::ST_i64);
if (!Opcode)
return false;
@@ -1389,7 +1387,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
const unsigned NumElts = getLoadStoreVectorNumElts(ST);
- SmallVector<SDValue, 16> Ops(ST->ops().slice(1, NumElts));
+ SmallVector<SDValue, 16> Ops;
+ for (auto &V : ST->ops().slice(1, NumElts))
+ Ops.push_back(selectPossiblyImm(V));
SDValue Addr = N->getOperand(NumElts + 1);
const unsigned ToTypeWidth = TotalWidth / NumElts;
@@ -1400,9 +1400,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SelectADDR(Addr, Base, Offset);
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
- getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
+ Offset, Chain});
const MVT::SimpleValueType EltVT =
ST->getOperand(1).getSimpleValueType().SimpleTy;
@@ -2102,6 +2101,19 @@ bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
return true;
}
+SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
+ if (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (auto *CN = dyn_cast<ConstantSDNode>(V))
+ return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
+ V.getValueType());
+ if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
+ return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
+ V.getValueType());
+ return V;
+}
+
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
unsigned int spN) const {
const Value *Src = nullptr;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 92b5c773258ed..a785e1513682e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
}
bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset);
+ SDValue selectPossiblyImm(SDValue V);
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 1ea6d98a1df8e..cb0275706ae3e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -184,6 +184,18 @@ class OneUse2<SDPatternOperator operator>
class fpimm_pos_inf<ValueType vt>
: FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>;
+
+
+// Operands which can hold a Register or an Immediate.
+//
+// Unfortunately, since most register classes can hold multiple types, we must
+// use the 'Any' type for these.
+
+def RI1 : Operand<i1>;
+def RI16 : Operand<Any>;
+def RI32 : Operand<Any>;
+def RI64 : Operand<Any>;
+
// Utility class to wrap up information about a register and DAG type for more
// convenient iteration and parameterization
class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
@@ -2276,19 +2288,20 @@ let mayLoad=1, hasSideEffects=0 in {
def LD_i64 : LD<B64>;
}
-class ST<NVPTXRegClass regclass>
+class ST<DAGOperand O>
: NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
- LdStCode:$Sign, i32imm:$toWidth, ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$toWidth"
+ (ins O:$src,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth,
+ ADDR:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth"
" \t[$addr], $src;", []>;
let mayStore=1, hasSideEffects=0 in {
- def ST_i8 : ST<B16>;
- def ST_i16 : ST<B16>;
- def ST_i32 : ST<B32>;
- def ST_i64 : ST<B64>;
+ def ST_i8 : ST<RI16>;
+ def ST_i16 : ST<RI16>;
+ def ST_i32 : ST<RI32>;
+ def ST_i64 : ST<RI64>;
}
// The following is used only in and after vector elementizations. Vector
@@ -2324,38 +2337,38 @@ let mayLoad=1, hasSideEffects=0 in {
defm LDV_i64 : LD_VEC<B64>;
}
-multiclass ST_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
+multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
def _v2 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
- LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth,
+ (ins O:$src1, O:$src2,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v4 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
- LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
+ (ins O:$src1, O:$src2, O:$src3, O:$src4,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
+ ADDR:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
if support_v8 then
def _v8 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- regclass:$src5, regclass:$src6, regclass:$src7, regclass:$src8,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
- i32imm:$fromWidth, ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
+ (ins O:$src1, O:$src2, O:$src3, O:$src4,
+ O:$src5, O:$src6, O:$src7, O:$src8,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
+ ADDR:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth "
"\t[$addr], "
"{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>;
}
let mayStore=1, hasSideEffects=0 in {
- defm STV_i8 : ST_VEC<B16>;
- defm STV_i16 : ST_VEC<B16>;
- defm STV_i32 : ST_VEC<B32, support_v8 = true>;
- defm STV_i64 : ST_VEC<B64>;
+ defm STV_i8 : ST_VEC<RI16>;
+ defm STV_i16 : ST_VEC<RI16>;
+ defm STV_i32 : ST_VEC<RI32, support_v8 = true>;
+ defm STV_i64 : ST_VEC<RI64>;
}
//---- Conversion ----
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index 9edd4de017ee2..601a35288f54d 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -107,8 +107,7 @@ define void @nested_const_expr() {
; PTX-LABEL: nested_const_expr(
; store 1 to bitcast(gep(addrspacecast(array), 0, 1))
store i32 1, ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1), align 4
-; PTX: mov.b32 %r1, 1;
-; PTX-NEXT: st.shared.b32 [array+4], %r1;
+; PTX: st.shared.b32 [array+4], 1;
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 95af9c64a73ac..0dc658757053c 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -1605,5 +1605,23 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
ret <2 x bfloat> %r
}
+define void @store_bf16(ptr %p1, ptr %p2, bfloat %v) {
+; CHECK-LABEL: store_bf16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [store_bf16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs1, [store_bf16_param_2];
+; CHECK-NEXT: st.b16 [%rd1], %rs1;
+; CHECK-NEXT: ld.param.b64 %rd2, [store_bf16_param_1];
+; CHECK-NEXT: st.b16 [%rd2], 0x3F80;
+; CHECK-NEXT: ret;
+ store bfloat %v, ptr %p1
+ store bfloat 1.0, ptr %p2
+ ret void
+}
+
declare bfloat @llvm.maximum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat>, <2 x bfloat>)
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 01e4065a7baa7..15de26ff4df30 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -723,3 +723,20 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
ret <2 x bfloat> %r
}
+define void @test_store_bf16x2(ptr %p1, ptr %p2, <2 x bfloat> %v) {
+; CHECK-LABEL: test_store_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_store_bf16x2_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_store_bf16x2_param_2];
+; CHECK-NEXT: st.b32 [%rd1], %r1;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_store_bf16x2_param_1];
+; CHECK-NEXT: st.b32 [%rd2], 1065369472;
+; CHECK-NEXT: ret;
+ store <2 x bfloat> %v, ptr %p1
+ store <2 x bfloat> <bfloat 1.0, bfloat 1.0>, ptr %p2
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/chain-different-as.ll b/llvm/test/CodeGen/NVPTX/chain-different-as.ll
index f2d0d9d069ea6..a33d286b47381 100644
--- a/llvm/test/CodeGen/NVPTX/chain-different-as.ll
+++ b/llvm/test/CodeGen/NVPTX/chain-different-as.ll
@@ -4,14 +4,13 @@
define i64 @test() nounwind readnone {
; CHECK-LABEL: test(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %rd1, 1;
-; CHECK-NEXT: mov.b64 %rd2, 42;
-; CHECK-NEXT: st.b64 [%rd1], %rd2;
-; CHECK-NEXT: ld.global.b64 %rd3, [%rd1];
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.b64 [%rd1], 42;
+; CHECK-NEXT: ld.global.b64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
%addr0 = inttoptr i64 1 to ptr
%addr1 = inttoptr i64 1 to ptr addrspace(1)
diff --git a/llvm/test/CodeGen/NVPTX/demote-vars.ll b/llvm/test/CodeGen/NVPTX/demote-vars.ll
index ab89b62b53d05..e554e4aaea36f 100644
--- a/llvm/test/CodeGen/NVPTX/demote-vars.ll
+++ b/llvm/test/CodeGen/NVPTX/demote-vars.ll
@@ -67,8 +67,7 @@ define void @define_private_global(i64 %val) {
; Also check that the if-then is still here, otherwise we may not be testing
; the "more-than-one-use" part.
; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct],
-; CHECK: mov.b64 %[[VAR:.*]], 25
-; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct], %[[VAR]]
+; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct], 25
define void @define_private_global_more_than_one_use(i64 %val, i1 %cond) {
store i64 %val, ptr addrspace(3) @private_global_used_more_than_once_in_same_fct
br i1 %cond, label %then, label %end
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 8da2c1d1ebac2..2b2fed153d823 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -2295,5 +2295,25 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
ret <2 x half> %r
}
+define void @test_store_2xhalf(ptr %p1, ptr %p2, <2 x half> %v) {
+; CHECK-LABEL: test_store_2xhalf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_store_2xhalf_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_store_2xhalf_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_store_2xhalf_param_0];
+; CHECK-NEXT: st.b32 [%rd1], %r1;
+; CHECK-NEXT: st.b32 [%rd2], 1006648320;
+; CHECK-NEXT: ret;
+ store <2 x half> %v, ptr %p1
+ store <2 x half> <half 1.0, half 1.0>, ptr %p2
+ ret void
+}
+
+
+
attributes #0 = { nounwind }
attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
index 50d39c88a46b9..5214d272e161f 100644
--- a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
@@ -10,14 +10,13 @@ target triple = "nvptx-nvidia-cuda"
define void @foo() {
; CHECK-LABEL: foo(
; CHECK: .reg .pred %p<2>;
-; CHECK: .reg .b16 %rs<4>;
+; CHECK: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK: ld.global.b8 %rs1, [i1g];
; CHECK: and.b16 %rs2, %rs1, 1;
; CHECK: setp.ne.b16 %p1, %rs2, 0;
; CHECK: @%p1 bra $L__BB0_2;
-; CHECK: mov.b16 %rs3, 1;
-; CHECK: st.global.b8 [i1g], %rs3;
+; CHECK: st.global.b8 [i1g], 1;
; CHECK: ret;
%tmp = load i1, ptr addrspace(1) @i1g, align 2
br i1 %tmp, label %if.end, label %if.then
diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
index 6bf65d4d4ad69..abe92a5bf79b9 100644
--- a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll
@@ -13,8 +13,8 @@ define i128 @foo(ptr %p, ptr %o) {
; CHECK-NEXT: ld.param.b64 %rd2, [foo_param_1];
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
; CHECK-NEXT: ld.b8 %rd3, [%rd1];
+; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, 0};
; CHECK-NEXT: mov.b64 %rd4, 0;
-; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4};
; CHECK-NEXT: ret;
%c = load i8, ptr %p, align 1
diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll
index 0718e6d603b6c..e1eeb66b5afc0 100644
--- a/llvm/test/CodeGen/NVPTX/jump-table.ll
+++ b/llvm/test/CodeGen/NVPTX/jump-table.ll
@@ -10,7 +10,7 @@ define void @foo(i32 %i) {
; CHECK-LABEL: foo(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b32 %r2, [foo_param_0];
@@ -24,20 +24,16 @@ define void @foo(i32 %i) {
; CHECK-NEXT: $L__BB0_5;
; CHECK-NEXT: brx.idx %r2, $L_brx_0;
; CHECK-NEXT: $L__BB0_2: // %case0
-; CHECK-NEXT: mov.b32 %r6, 0;
-; CHECK-NEXT: st.global.b32 [out], %r6;
+; CHECK-NEXT: st.global.b32 [out], 0;
; CHECK-NEXT: bra.uni $L__BB0_6;
; CHECK-NEXT: $L__BB0_4: // %case2
-; CHECK-NEXT: mov.b32 %r4, 2;
-; CHECK-NEXT: st.global.b32 [out], %r4;
+; CHECK-NEXT: st.global.b32 [out], 2;
; CHECK-NEXT: bra.uni $L__BB0_6;
; CHECK-NEXT: $L__BB0_5: // %case3
-; CHECK-NEXT: mov.b32 %r3, 3;
-; CHECK-NEXT: st.global.b32 [out], %r3;
+; CHECK-NEXT: st.global.b32 [out], 3;
; CHECK-NEXT: bra.uni $L__BB0_6;
; CHECK-NEXT: $L__BB0_3: // %case1
-; CHECK-NEXT: mov.b32 %r5, 1;
-; CHECK-NEXT: st.global.b32 [out], %r5;
+; CHECK-NEXT: st.global.b32 [out], 1;
; CHECK-NEXT: $L__BB0_6: // %end
; CHECK-NEXT: ret;
entry:
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index a9004d00e7807..5c3017310d0a3 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -136,7 +136,7 @@ define void @foo4() {
; PTX32-NEXT: .local .align 4 .b8 __local_depot3[8];
; PTX32-NEXT: .reg .b32 %SP;
; PTX32-NEXT: .reg .b32 %SPL;
-; PTX32-NEXT: .reg .b32 %r<6>;
+; PTX32-NEXT: .reg .b32 %r<5>;
; PTX32-EMPTY:
; PTX32-NEXT: // %bb.0:
; PTX32-NEXT: mov.b32 %SPL, __local_depot3;
@@ -145,9 +145,8 @@ define void @foo4() {
; PTX32-NEXT: add.u32 %r2, %SPL, 0;
; PTX32-NEXT: add.u32 %r3, %SP, 4;
; PTX32-NEXT: add.u32 %r4, %SPL, 4;
-; PTX32-NEXT: mov.b32 %r5, 0;
-; PTX32-NEXT: st.local.b32 [%r2], %r5;
-; PTX32-NEXT: st.local.b32 [%r4], %r5;
+; PTX32-NEXT: st.local.b32 [%r2], 0;
+; PTX32-NEXT: st.local.b32 [%r4], 0;
; PTX32-NEXT: { // callseq 1, 0
; PTX32-NEXT: .param .b32 param0;
; PTX32-NEXT: st.param.b32 [param0], %r1;
@@ -165,7 +164,6 @@ define void @foo4() {
; PTX64-NEXT: .local .align 4 .b8 __local_depot3[8];
; PTX64-NEXT: .reg .b64 %SP;
; PTX64-NEXT: .reg .b64 %SPL;
-; PTX64-NEXT: .reg .b32 %r<2>;
; PTX64-NEXT: .reg .b64 %rd<5>;
; PTX64-EMPTY:
; PTX64-NEXT: // %bb.0:
@@ -175,9 +173,8 @@ define void @foo4() {
; PTX64-NEXT: add.u64 %rd2, %SPL, 0;
; PTX64-NEXT: add.u64 %rd3, %SP, 4;
; PTX64-NEXT: add.u64 %rd4, %SPL, 4;
-; PTX64-NEXT: mov.b32 %r1, 0;
-; PTX64-NEXT: st.local.b32 [%rd2], %r1;
-; PTX64-NEXT: st.local.b32 [%rd4], %r1;
+; PTX64-NEXT: st.local.b32 [%rd2], 0;
+; PTX64-NEXT: st.local.b32 [%rd4], 0;
; PTX64-NEXT: { // callseq 1, 0
; PTX64-NEXT: .param .b64 param0;
; PTX64-NEXT: st.param.b64 [param0], %rd1;
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 489bcf4a7d55c..57c1e5826c89a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -15,7 +15,7 @@ define ptx_kernel void @kernel() {
; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr %A to ptr addrspace(5)
; LOWERALLOCAONLY: [[V2:%.*]] = addrspacecast ptr addrspace(5) [[V1]] to ptr
; LOWERALLOCAONLY: store i32 0, ptr [[V2]], align 4
-; PTX: st.local.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+; PTX: st.local.b32 [{{%rd[0-9]+}}], 0
store i32 0, ptr %A
call void @callee(ptr %A)
ret void
@@ -26,7 +26,7 @@ define void @alloca_in_explicit_local_as() {
; PTX-LABEL: .visible .func alloca_in_explicit_local_as(
%A = alloca i32, addrspace(5)
; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
-; PTX: st.local.b32 [%SP], {{%r[0-9]+}}
+; PTX: st.local.b32 [%SP], 0
; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr addrspace(5) %A to ptr
; LOWERALLOCAONLY: store i32 0, ptr [[V1]], align 4
store i32 0, ptr addrspace(5) %A
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index d268562914755..4784d7093a796 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -650,7 +650,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b16 %rs<3>;
-; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %bb
@@ -666,8 +666,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; PTX-NEXT: add.u64 %rd2, %SPL, 4;
; PTX-NEXT: add.u64 %rd4, %SPL, 0;
; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
-; PTX-NEXT: mov.b32 %r3, 1;
-; PTX-NEXT: st.local.b32 [%rd5], %r3;
+; PTX-NEXT: st.local.b32 [%rd5], 1;
; PTX-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
@@ -830,7 +829,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b16 %rs<3>;
-; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %bb
@@ -849,8 +848,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; PTX-NEXT: // %bb.1: // %second
; PTX-NEXT: mov.b64 %rd6, %rd1;
; PTX-NEXT: $L__BB14_2: // %merge
-; PTX-NEXT: mov.b32 %r3, 1;
-; PTX-NEXT: st.local.b32 [%rd6], %r3;
+; PTX-NEXT: st.local.b32 [%rd6], 1;
; PTX-NEXT: ret;
bb:
br i1 %cond, label %first, label %second
diff --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll
index 16220fb4d47bb..c85080fdf295a 100644
--- a/llvm/test/CodeGen/NVPTX/param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/param-align.ll
@@ -73,12 +73,10 @@ define ptx_device void @t6() {
; CHECK-LABEL: .func check_ptr_align1(
; CHECK: ld.param.b64 %rd1, [check_ptr_align1_param_0];
; CHECK-NOT: ld.param.b8
-; CHECK: mov.b32 %r1, 0;
-; CHECK: st.b8 [%rd1+3], %r1;
-; CHECK: st.b8 [%rd1+2], %r1;
-; CHECK: st.b8 [%rd1+1], %r1;
-; CHECK: mov.b32 %r2, 1;
-; CHECK: st.b8 [%rd1], %r2;
+; CHECK: st.b8 [%rd1+3], 0;
+; CHECK: st.b8 [%rd1+2], 0;
+; CHECK: st.b8 [%rd1+1], 0;
+; CHECK: st.b8 [%rd1], 1;
; CHECK: ret;
define void @check_ptr_align1(ptr align 1 %_arg_ptr) {
entry:
@@ -89,10 +87,8 @@ entry:
; CHECK-LABEL: .func check_ptr_align2(
; CHECK: ld.param.b64 %rd1, [check_ptr_align2_param_0];
; CHECK-NOT: ld.param.b16
-; CHECK: mov.b32 %r1, 0;
-; CHECK: st.b16 [%rd1+2], %r1;
-; CHECK: mov.b32 %r2, 2;
-; CHECK: st.b16 [%rd1], %r2;
+; CHECK: st.b16 [%rd1+2], 0;
+; CHECK: st.b16 [%rd1], 2;
; CHECK: ret;
define void @check_ptr_align2(ptr align 2 %_arg_ptr) {
entry:
@@ -103,8 +99,7 @@ entry:
; CHECK-LABEL: .func check_ptr_align4(
; CHECK: ld.param.b64 %rd1, [check_ptr_align4_param_0];
; CHECK-NOT: ld.param.b32
-; CHECK: mov.b32 %r1, 4;
-; CHECK: st.b32 [%rd1], %r1;
+; CHECK: st.b32 [%rd1], 4;
; CHECK: ret;
define void @check_ptr_align4(ptr align 4 %_arg_ptr) {
entry:
@@ -114,8 +109,7 @@ entry:
; CHECK-LABEL: .func check_ptr_align8(
; CHECK: ld.param.b64 %rd1, [check_ptr_align8_param_0];
-; CHECK: mov.b32 %r1, 8;
-; CHECK: st.b32 [%rd1], %r1;
+; CHECK: st.b32 [%rd1], 8;
; CHECK: ret;
define void @check_ptr_align8(ptr align 8 %_arg_ptr) {
entry:
diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
index b6f1964c54c76..cd2505c20d39c 100644
--- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -4,10 +4,8 @@
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
define ptx_kernel void @t1(ptr %a) {
-; PTX32: mov.b16 %rs{{[0-9]+}}, 0;
-; PTX32-NEXT: st.global.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
-; PTX64: mov.b16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX32: st.global.b8 [%r{{[0-9]+}}], 0;
+; PTX64: st.global.b8 [%rd{{[0-9]+}}], 0;
store i1 false, ptr %a
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll
index fb065e1b01bbe..ea45bfdc5e190 100644
--- a/llvm/test/CodeGen/NVPTX/reg-types.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -6,64 +6,47 @@
; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; CHECK-LABEL: .visible .func func()
-; NO8BIT-LABEL: .visible .func func()
-define void @func() {
-entry:
- %s8 = alloca i8, align 1
- %u8 = alloca i8, align 1
- %s16 = alloca i16, align 2
- %u16 = alloca i16, align 2
-; Both 8- and 16-bit integers are packed into 16-bit registers.
-; CHECK-DAG: .reg .b16 %rs<
-; We should not generate 8-bit registers.
+; CHECK-LABEL: .visible .func func(
+; NO8BIT-LABEL: .visible .func func(
+define void @func(ptr %p, i1 %cond) {
+; Both 8- and 16-bit integers are packed into 16-bit registers. So we should
+; not generate 8-bit registers.
; NO8BIT-NOT: .reg .{{[bsu]}}8
- %s32 = alloca i32, align 4
- %u32 = alloca i32, align 4
+
+; CHECK-DAG: .reg .pred %p<
+; CHECK-DAG: .reg .b16 %rs<
; CHECK-DAG: .reg .b32 %r<
- %s64 = alloca i64, align 8
- %u64 = alloca i64, align 8
; CHECK-DAG: .reg .b64 %rd<
- %f32 = alloca float, align 4
- %f64 = alloca double, align 8
-; Verify that we use correct register types.
- store i8 1, ptr %s8, align 1
-; CHECK: mov.b16 [[R1:%rs[0-9]]], 1;
-; CHECK-NEXT: st.b8 {{.*}}, [[R1]]
- store i8 2, ptr %u8, align 1
-; CHECK: mov.b16 [[R2:%rs[0-9]]], 2;
-; CHECK-NEXT: st.b8 {{.*}}, [[R2]]
- store i16 3, ptr %s16, align 2
-; CHECK: mov.b16 [[R3:%rs[0-9]]], 3;
-; CHECK-NEXT: st.b16 {{.*}}, [[R3]]
- store i16 4, ptr %u16, align 2
-; CHECK: mov.b16 [[R4:%rs[0-9]]], 4;
-; CHECK-NEXT: st.b16 {{.*}}, [[R4]]
- store i32 5, ptr %s32, align 4
-; CHECK: mov.b32 [[R5:%r[0-9]]], 5;
-; CHECK-NEXT: st.b32 {{.*}}, [[R5]]
- store i32 6, ptr %u32, align 4
-; CHECK: mov.b32 [[R6:%r[0-9]]], 6;
-; CHECK-NEXT: st.b32 {{.*}}, [[R6]]
- store i64 7, ptr %s64, align 8
-; CHECK: mov.b64 [[R7:%rd[0-9]]], 7;
-; CHECK-NEXT: st.b64 {{.*}}, [[R7]]
- store i64 8, ptr %u64, align 8
-; CHECK: mov.b64 [[R8:%rd[0-9]]], 8;
-; CHECK-NEXT: st.b64 {{.*}}, [[R8]]
+entry:
+ br i1 %cond, label %if, label %join
+if:
+ br label %join
+join:
+ ; CHECK-DAG: mov.pred %p{{[0-9]+}}, %p{{[0-9]+}};
+ ; CHECK-DAG: mov.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}};
+ ; CHECK-DAG: mov.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}};
+ ; CHECK-DAG: mov.b32 %r{{[0-9]+}}, %r{{[0-9]+}};
+ ; CHECK-DAG: mov.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+
+ ; CHECK-DAG: mov.b32 %r{{[0-9]+}}, %r{{[0-9]+}};
+ ; CHECK-DAG: mov.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
+
+ %v1 = phi i1 [ true, %if ], [ false, %entry ]
+ %v8 = phi i8 [ 1, %if ], [ 0, %entry ]
+ %v16 = phi i16 [ 2, %if ], [ 0, %entry ]
+ %v32 = phi i32 [ 3, %if ], [ 0, %entry ]
+ %v64 = phi i64 [ 4, %if ], [ 0, %entry ]
+ %f32 = phi float [ 5.0, %if ], [ 0.0, %entry ]
+ %f64 = phi double [ 6.0, %if ], [ 0.0, %entry ]
-; FP constants are stored via integer registers, but that's an
-; implementation detail that's irrelevant here.
- store float 9.000000e+00, ptr %f32, align 4
- store double 1.000000e+01, ptr %f64, align 8
-; Instead, we force a load into a register and then verify register type.
- %f32v = load volatile float, ptr %f32, align 4
-; CHECK: ld.volatile.b32 %r{{[0-9]+}}
- %f64v = load volatile double, ptr %f64, align 8
-; CHECK: ld.volatile.b64 %rd{{[0-9]+}}
+ store volatile i1 %v1, ptr %p, align 1
+ store volatile i8 %v8, ptr %p, align 1
+ store volatile i16 %v16, ptr %p, align 2
+ store volatile i32 %v32, ptr %p, align 4
+ store volatile i64 %v64, ptr %p, align 8
+ store volatile float %f32, ptr %p, align 4
+ store volatile double %f64, ptr %p, align 8
ret void
-; CHECK: ret;
-; NO8BIT: ret;
}
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index ca1b722527a89..ab9202650577a 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -104,32 +104,28 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: mov.b64 %rd1, 4294967297;
-; CHECK-PTX-NEXT: st.b64 [%SP], %rd1;
-; CHECK-PTX-NEXT: mov.b32 %r1, 1;
-; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT: mov.b64 %rd2, 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd2;
-; CHECK-PTX-NEXT: mov.b64 %rd3, 4607182418800017408;
-; CHECK-PTX-NEXT: st.b64 [%SP+24], %rd3;
-; CHECK-PTX-NEXT: st.b64 [%SP+32], %rd3;
-; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
+; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
+; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
+; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
+; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 0, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0];
+; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 0
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
%conv = sext i8 1 to i32
@@ -206,9 +202,9 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
-; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
+; CHECK-PTX-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3;
@@ -220,23 +216,20 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
-; CHECK-PTX-NEXT: mov.b32 %r1, 1;
-; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
-; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
-; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
-; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;
+; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
+; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
+; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
+; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0];
+; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 1
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
%s1.sroa.3 = alloca [3 x i8], align 1
@@ -290,14 +283,13 @@ define dso_local i32 @baz() {
; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-NEXT: .reg .b32 %r<3>;
; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: mov.b32 %r1, 1;
-; CHECK-PTX-NEXT: st.v4.b32 [%SP], {%r1, %r1, %r1, %r1};
+; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 2, 0
; CHECK-PTX-NEXT: .param .b32 param0;
@@ -306,9 +298,9 @@ define dso_local i32 @baz() {
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0];
+; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 2
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
%call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef <i32 1, i32 1, i32 1, i32 1>)
@@ -360,7 +352,7 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7;
@@ -370,17 +362,16 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: st.local.b64 [%rd2+8], %rd3;
; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4;
-; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
-; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd5;
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2];
-; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd8, %SP, 16;
+; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
+; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd2];
+; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
+; CHECK-PTX-NEXT: add.u64 %rd7, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd7;
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd5;
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd8;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd7;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];