[llvm] [NVPTX] Do not mark move of global address as cheap enabling more CSE (PR #153730)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 19:06:17 PDT 2025
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/153730
None
>From 2020191f5c565f323e6dd41e95400c79e0db133e Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 15 Aug 2025 02:04:55 +0000
Subject: [PATCH 1/2] pre-commit tests
---
llvm/test/CodeGen/NVPTX/cse-mov-sym.ll | 60 ++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
new file mode 100644
index 0000000000000..8fc907f87ab43
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+ at global_smem = external addrspace(3) global [0 x i8], align 16
+
+
+;; Confirm the mov.b64 of global_smem is CSE'd. We need to make things a bit
+;; complex with a loop to make this interesting.
+define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
+; CHECK-LABEL: test_mov_sym(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT: ld.param.b32 %r4, [test_mov_sym_param_0];
+; CHECK-NEXT: cvt.s64.s32 %rd1, %r4;
+; CHECK-NEXT: mov.b64 %rd2, global_smem;
+; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1;
+; CHECK-NEXT: ld.shared.b32 %r7, [%rd3];
+; CHECK-NEXT: not.pred %p2, %p1;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.1: // %if1.preheader
+; CHECK-NEXT: ld.param.b32 %r5, [test_mov_sym_param_1];
+; CHECK-NEXT: setp.ne.b32 %p3, %r4, %r5;
+; CHECK-NEXT: $L__BB0_2: // %if1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: @%p3 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.3: // %if2
+; CHECK-NEXT: cvt.s64.s32 %rd4, %r5;
+; CHECK-NEXT: mov.b64 %rd5, global_smem;
+; CHECK-NEXT: add.s64 %rd6, %rd5, %rd4;
+; CHECK-NEXT: ld.shared.b32 %r6, [%rd6];
+; CHECK-NEXT: add.s32 %r7, %r7, %r6;
+; CHECK-NEXT: $L__BB0_4: // %end
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+entry:
+ %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1
+ %val = load i32, ptr addrspace(3) %gep
+ br i1 %cond, label %if1, label %end
+if1:
+ %cond2 = icmp eq i32 %offset1, %offset2
+ br i1 %cond2, label %if2, label %if1
+if2:
+ %gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2
+ %val2 = load i32, ptr addrspace(3) %gep2
+ %add = add i32 %val, %val2
+ br label %end
+end:
+ %ret = phi i32 [ %add, %if2 ], [ %val, %entry ]
+ ret i32 %ret
+}
>From 04a70580017fe7f9661d4ac429a443d64ff46542 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 15 Aug 2025 02:06:32 +0000
Subject: [PATCH 2/2] [NVPTX] Do not mark moves of global addresses as cheap,
enabling more CSE
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 25 ++++++++++++++++---------
llvm/test/CodeGen/NVPTX/cse-mov-sym.ll | 3 +--
2 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ebb5e32f5e6fc..18b16b350fa0c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1510,18 +1510,22 @@ let hasSideEffects = false in {
"mov.b64 \t$d, __local_depot$num;">;
}
-
-// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
-let hasSideEffects = false, isAsCheapAsAMove = true in {
- let isMoveReg = true in
+let hasSideEffects = false in {
+ let isMoveReg = true, isAsCheapAsAMove = true in
class MOVr<RegisterClass RC, string OpStr> :
BasicNVPTXInst<(outs RC:$dst), (ins RC:$src), "mov." # OpStr>;
- let isMoveImm = true in
+ let isMoveImm = true, isAsCheapAsAMove = true in
class MOVi<RegTyInfo t, string suffix> :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$src),
"mov." # suffix,
[(set t.Ty:$dst, t.ImmNode:$src)]>;
+
+ // We don't want to set isAsCheapAsAMove to true for these instructions as
+ // this would prevent CSE and resulted in regressions.
+ class MovSymInst<RegTyInfo t> :
+ BasicNVPTXInst<(outs t.RC:$dst), (ins Operand<t.Ty>:$src),
+ "mov.b" # t.Size>;
}
def MOV_B1_r : MOVr<B1, "pred">;
@@ -1539,6 +1543,9 @@ def MOV_BF16_i : MOVi<BF16RT, "b16">;
def MOV_F32_i : MOVi<F32RT, "b32">;
def MOV_F64_i : MOVi<F64RT, "b64">;
+def MOV_B32_sym : MovSymInst<I32RT>;
+def MOV_B64_sym : MovSymInst<I64RT>;
+
def to_tglobaladdr : SDNodeXForm<globaladdr, [{
return CurDAG->getTargetGlobalAddress(N->getGlobal(), SDLoc(N),
@@ -1555,11 +1562,11 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
return CurDAG->getTargetFrameIndex(N->getIndex(), N->getValueType(0));
}]>;
-def : Pat<(i32 globaladdr:$dst), (MOV_B32_i (to_tglobaladdr $dst))>;
-def : Pat<(i64 globaladdr:$dst), (MOV_B64_i (to_tglobaladdr $dst))>;
+def : Pat<(i32 globaladdr:$dst), (MOV_B32_sym (to_tglobaladdr $dst))>;
+def : Pat<(i64 globaladdr:$dst), (MOV_B64_sym (to_tglobaladdr $dst))>;
-def : Pat<(i32 externalsym:$dst), (MOV_B32_i (to_texternsym $dst))>;
-def : Pat<(i64 externalsym:$dst), (MOV_B64_i (to_texternsym $dst))>;
+def : Pat<(i32 externalsym:$dst), (MOV_B32_sym (to_texternsym $dst))>;
+def : Pat<(i64 externalsym:$dst), (MOV_B64_sym (to_texternsym $dst))>;
//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs B32:$dst), (ins ADDR:$addr),
diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
index 8fc907f87ab43..31ecce5a66b64 100644
--- a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
+++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll
@@ -35,8 +35,7 @@ define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) {
; CHECK-NEXT: @%p3 bra $L__BB0_2;
; CHECK-NEXT: // %bb.3: // %if2
; CHECK-NEXT: cvt.s64.s32 %rd4, %r5;
-; CHECK-NEXT: mov.b64 %rd5, global_smem;
-; CHECK-NEXT: add.s64 %rd6, %rd5, %rd4;
+; CHECK-NEXT: add.s64 %rd6, %rd2, %rd4;
; CHECK-NEXT: ld.shared.b32 %r6, [%rd6];
; CHECK-NEXT: add.s32 %r7, %r7, %r6;
; CHECK-NEXT: $L__BB0_4: // %end
More information about the llvm-commits
mailing list