[llvm] [AMDGPU] Make `getAssumedAddrSpace` return AS1 for pointer kernel arguments (PR #137488)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Thu May 29 17:53:42 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/137488
From 62942e8741da84846db29a80c7d9cc190f8f3528 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 29 May 2025 20:53:17 -0400
Subject: [PATCH] [AMDGPU] Make `getAssumedAddrSpace` return AS1 for pointer
kernel arguments
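
This makes the target report the global address space (AS1) as the assumed
address space for non-byref pointer arguments of AMDGPU kernels, on the
assumption that pointers passed in from the host refer to global memory.
A minimal sketch (hypothetical kernel, not part of this patch) of the
rewrite this enables in InferAddressSpaces:

  define amdgpu_kernel void @example(ptr %p) {
    ; Before this patch the store stayed flat:
    ;   store i32 0, ptr %p, align 4
    ; With AS1 assumed for %p, the access is routed through addrspace(1):
    %p.cast = addrspacecast ptr %p to ptr addrspace(1)
    store i32 0, ptr addrspace(1) %p.cast, align 4
    ret void
  }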
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +
.../Transforms/IPO/AttributorAttributes.cpp | 23 ++--
.../AMDGPU/GlobalISel/fp-atomics-gfx942.ll | 16 ++-
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 58 ++++------
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 84 +++++++--------
llvm/test/CodeGen/AMDGPU/aa-as-infer.ll | 6 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 102 +++++++++---------
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 52 +++++----
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 88 +++++++--------
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 88 +++++++--------
.../InferAddressSpaces/AMDGPU/basic.ll | 14 ++-
.../AMDGPU/mem-intrinsics.ll | 24 +++--
.../test/Transforms/OpenMP/barrier_removal.ll | 23 ++--
.../Transforms/OpenMP/spmdization_guarding.ll | 68 +++++++-----
14 files changed, 316 insertions(+), 334 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9091fdd5c959f..1ab4458bafcc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -977,6 +977,10 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ if (auto *Arg = dyn_cast<Argument>(V);
+ Arg && AMDGPU::isKernelCC(Arg->getParent()) && !Arg->hasByRefAttr())
+ return AMDGPUAS::GLOBAL_ADDRESS;
+
const auto *LD = dyn_cast<LoadInst>(V);
if (!LD) // TODO: Handle invariant load like constant.
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 470c5308edca4..3ce03a4b96f61 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12592,29 +12592,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
}
ChangeStatus updateImpl(Attributor &A) override {
- unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value();
uint32_t OldAddressSpace = AssumedAddressSpace;
auto CheckAddressSpace = [&](Value &Obj) {
if (isa<UndefValue>(&Obj))
return true;
- // If an argument in flat address space only has addrspace cast uses, and
- // those casts are same, then we take the dst addrspace.
if (auto *Arg = dyn_cast<Argument>(&Obj)) {
- if (Arg->getType()->getPointerAddressSpace() == FlatAS) {
- unsigned CastAddrSpace = FlatAS;
- for (auto *U : Arg->users()) {
- auto *ASCI = dyn_cast<AddrSpaceCastInst>(U);
- if (!ASCI)
- return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
- if (CastAddrSpace != FlatAS &&
- CastAddrSpace != ASCI->getDestAddressSpace())
- return false;
- CastAddrSpace = ASCI->getDestAddressSpace();
- }
- if (CastAddrSpace != FlatAS)
- return takeAddressSpace(CastAddrSpace);
- }
+ auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+ *Arg->getParent());
+ unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg);
+ if (AssumedAS != ~0U)
+ return takeAddressSpace(AssumedAS);
}
return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
index 6792612ded368..acf1a754c5a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
@@ -1,34 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefix=GFX942
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
+define void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX942: ; %bb.0:
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
+define void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX942: ; %bb.0:
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 63009bdc2643f..a4b2d5613df60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1334,92 +1334,82 @@ main_body:
ret void
}
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
+define void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
}
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
+define void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
}
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
+define void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -1506,30 +1496,26 @@ main_body:
ret double %ret
}
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
+define void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX942: ; %bb.0: ; %main_body
-; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: s_endpgm
+; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b905e548846ab..4196d4fb0730c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -625,7 +625,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
ret <4 x float> %r
}
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
@@ -740,7 +740,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
ret void
}
-define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
@@ -845,12 +845,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
@@ -862,8 +860,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
@@ -877,9 +876,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
;
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
@@ -891,8 +888,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s6
-; GFX1013-NEXT: v_mov_b32_e32 v1, s7
+; GFX1013-NEXT: v_mov_b32_e32 v0, s0
+; GFX1013-NEXT: v_mov_b32_e32 v1, s1
+; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
@@ -906,16 +904,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7
+; GFX11-NEXT: s_mov_b32 s6, 2.0
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v9, 0xb36211c7 :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
@@ -925,11 +920,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 1.0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -956,12 +953,10 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
@@ -970,8 +965,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
@@ -985,9 +981,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
@@ -996,8 +990,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s6
-; GFX1013-NEXT: v_mov_b32_e32 v1, s7
+; GFX1013-NEXT: v_mov_b32_e32 v0, s0
+; GFX1013-NEXT: v_mov_b32_e32 v1, s1
+; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
@@ -1011,25 +1006,24 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s6, 2.0
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s5, 1.0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
index d1a6414fe49ae..cc2c80060231c 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
@@ -246,8 +246,7 @@ define void @foo(ptr addrspace(3) %val) {
define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val) {
; CHECK-LABEL: define void @kernel_argument_promotion_pattern_intra_procedure(
; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[P_CAST_0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[P_CAST_0]], align 4
+; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4
; CHECK-NEXT: ret void
;
%p.cast.0 = addrspacecast ptr %p to ptr addrspace(1)
@@ -259,8 +258,7 @@ define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val)
define internal void @use_argument_after_promotion(ptr %p, i32 %val) {
; CHECK-LABEL: define internal void @use_argument_after_promotion(
; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4
; CHECK-NEXT: ret void
;
store i32 %val, ptr %p
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index ec80efc5f0362..65ae09b8d5bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -109,108 +109,102 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
+; FIXME: This function had the amdgpu_kernel CC, which was removed as part of
+; PR #137488. Add it back once the infer address space pass moves to the middle end.
+define void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8V4-NEXT: s_load_dword s4, s[6:7], 0x40
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
+; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
-; GFX8V4-NEXT: s_endpgm
+; GFX8V4-NEXT: s_setpc_b64 s[30:31]
;
; GFX8V5-LABEL: llvm_amdgcn_is_shared:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8V5-NEXT: s_mov_b64 s[4:5], 0xc4
+; GFX8V5-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V5-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
+; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
-; GFX8V5-NEXT: s_endpgm
+; GFX8V5-NEXT: s_setpc_b64 s[30:31]
;
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
; GFX9V4: ; %bb.0:
-; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1
+; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
-; GFX9V4-NEXT: s_endpgm
+; GFX9V4-NEXT: s_setpc_b64 s[30:31]
;
; GFX9V5-LABEL: llvm_amdgcn_is_shared:
; GFX9V5: ; %bb.0:
-; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1
+; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
-; GFX9V5-NEXT: s_endpgm
+; GFX9V5-NEXT: s_setpc_b64 s[30:31]
%is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
%zext = zext i1 %is.shared to i32
store volatile i32 %zext, ptr addrspace(1) poison
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
+; FIXME: This function had the amdgpu_kernel CC, which was removed as part of
+; PR #137488. Add it back once the infer address space pass moves to the middle end.
+define void @llvm_amdgcn_is_private(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8V4-NEXT: s_load_dword s4, s[6:7], 0x44
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
+; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
-; GFX8V4-NEXT: s_endpgm
+; GFX8V4-NEXT: s_setpc_b64 s[30:31]
;
; GFX8V5-LABEL: llvm_amdgcn_is_private:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8V5-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX8V5-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
-; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V5-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
+; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
-; GFX8V5-NEXT: s_endpgm
+; GFX8V5-NEXT: s_setpc_b64 s[30:31]
;
; GFX9V4-LABEL: llvm_amdgcn_is_private:
; GFX9V4: ; %bb.0:
-; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9V4-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9V4-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1
+; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
-; GFX9V4-NEXT: s_endpgm
+; GFX9V4-NEXT: s_setpc_b64 s[30:31]
;
; GFX9V5-LABEL: llvm_amdgcn_is_private:
; GFX9V5: ; %bb.0:
-; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9V5-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9V5-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1
+; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
-; GFX9V5-NEXT: s_endpgm
+; GFX9V5-NEXT: s_setpc_b64 s[30:31]
%is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
%zext = zext i1 %is.private to i32
store volatile i32 %zext, ptr addrspace(1) poison
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 5bbb0e5563cc2..81a9840140258 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -407,7 +407,7 @@ main_body:
; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -571,7 +571,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -719,7 +719,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
@@ -828,16 +828,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
;
; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
-; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7
+; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v9, 0xb36211c7 :: v_dual_lshlrev_b32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000
; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0
; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000
@@ -847,17 +844,19 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3]
@@ -880,7 +879,7 @@ main_body:
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
@@ -983,31 +982,30 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
;
; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
-; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600
; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 9e1815b48abfd..dad165ff3fadf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -116,111 +116,105 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
+define void @is_private_sgpr(ptr inreg %ptr) {
; SI-LABEL: is_private_sgpr:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s0, s[8:9], 0x1
-; SI-NEXT: s_load_dword s1, s[8:9], 0x32
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], 0xc0
+; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_eq_u32 s0, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; SI-NEXT: s_cmp_eq_u32 s17, s4
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccnz .LBB1_2
; SI-NEXT: ; %bb.1: ; %bb0
-; SI-NEXT: s_mov_b32 s3, 0x100f000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s7, 0x100f000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: .LBB1_2: ; %bb1
-; SI-NEXT: s_endpgm
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; CI-SDAG-LABEL: is_private_sgpr:
; CI-SDAG: ; %bb.0:
-; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
-; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-SDAG-NEXT: s_mov_b64 s[4:5], 0xc0
+; CI-SDAG-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
-; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; CI-SDAG-NEXT: s_cmp_eq_u32 s17, s4
+; CI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2
; CI-SDAG-NEXT: ; %bb.1: ; %bb0
; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; CI-SDAG-NEXT: flat_store_dword v[0:1], v0
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
; CI-SDAG-NEXT: .LBB1_2: ; %bb1
-; CI-SDAG-NEXT: s_endpgm
+; CI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: is_private_sgpr:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s17, s5
+; GFX9-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1
-; GFX9-SDAG-NEXT: s_endpgm
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CI-GISEL-LABEL: is_private_sgpr:
; CI-GISEL: ; %bb.0:
-; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-GISEL-NEXT: s_mov_b64 s[4:5], 0xc0
+; CI-GISEL-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
+; CI-GISEL-NEXT: s_cmp_lg_u32 s17, s4
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
; CI-GISEL-NEXT: ; %bb.1: ; %bb0
; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; CI-GISEL-NEXT: flat_store_dword v[0:1], v0
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
; CI-GISEL-NEXT: .LBB1_2: ; %bb1
-; CI-GISEL-NEXT: s_endpgm
+; CI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: is_private_sgpr:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s17, s5
; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1
-; GFX9-GISEL-NEXT: s_endpgm
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: is_private_sgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: s_cmp_lg_u32 s17, s5
; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
; GFX10-NEXT: ; %bb.1: ; %bb0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: .LBB1_2: ; %bb1
-; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: is_private_sgpr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, s3
; GFX11-NEXT: s_cbranch_scc1 .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
@@ -228,7 +222,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index c364c391559ea..cf8608fa23d30 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -149,7 +149,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
+define void @is_local_sgpr(ptr inreg %ptr) {
; CIT-LABEL: is_local_sgpr:
; CIT: ; %bb.0:
; CIT-NEXT: s_load_dword s0, s[6:7], 0x1
@@ -186,108 +186,102 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
;
; SI-LABEL: is_local_sgpr:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s0, s[8:9], 0x1
-; SI-NEXT: s_load_dword s1, s[8:9], 0x33
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], 0xc4
+; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_eq_u32 s0, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; SI-NEXT: s_cmp_eq_u32 s17, s4
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccnz .LBB1_2
; SI-NEXT: ; %bb.1: ; %bb0
-; SI-NEXT: s_mov_b32 s3, 0x100f000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s7, 0x100f000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: .LBB1_2: ; %bb1
-; SI-NEXT: s_endpgm
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; CI-SDAG-LABEL: is_local_sgpr:
; CI-SDAG: ; %bb.0:
-; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
-; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-SDAG-NEXT: s_mov_b64 s[4:5], 0xc4
+; CI-SDAG-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
-; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; CI-SDAG-NEXT: s_cmp_eq_u32 s17, s4
+; CI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2
; CI-SDAG-NEXT: ; %bb.1: ; %bb0
; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; CI-SDAG-NEXT: flat_store_dword v[0:1], v0
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
; CI-SDAG-NEXT: .LBB1_2: ; %bb1
-; CI-SDAG-NEXT: s_endpgm
+; CI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: is_local_sgpr:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4
-; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1
-; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s17, s5
+; GFX9-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1
-; GFX9-SDAG-NEXT: s_endpgm
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CI-GISEL-LABEL: is_local_sgpr:
; CI-GISEL: ; %bb.0:
-; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-GISEL-NEXT: s_mov_b64 s[4:5], 0xc4
+; CI-GISEL-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
+; CI-GISEL-NEXT: s_cmp_lg_u32 s17, s4
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
; CI-GISEL-NEXT: ; %bb.1: ; %bb0
; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; CI-GISEL-NEXT: flat_store_dword v[0:1], v0
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
; CI-GISEL-NEXT: .LBB1_2: ; %bb1
-; CI-GISEL-NEXT: s_endpgm
+; CI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: is_local_sgpr:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s17, s5
; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1
-; GFX9-GISEL-NEXT: s_endpgm
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: is_local_sgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX10-NEXT: s_cmp_lg_u32 s17, s5
; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
; GFX10-NEXT: ; %bb.1: ; %bb0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: .LBB1_2: ; %bb1
-; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: is_local_sgpr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, s3
; GFX11-NEXT: s_cbranch_scc1 .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
@@ -295,7 +289,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index 99fe986cf6378..60bb38f863e8e 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -66,7 +66,9 @@ define amdgpu_kernel void @store_global_from_flat(ptr %generic_scalar) #0 {
define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 {
; CHECK-LABEL: define amdgpu_kernel void @store_group_from_flat(
; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr
+; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(3)
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) [[_TMP0]], align 4
; CHECK-NEXT: ret void
;
@@ -78,7 +80,9 @@ define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 {
define amdgpu_kernel void @store_private_from_flat(ptr %generic_scalar) #0 {
; CHECK-LABEL: define amdgpu_kernel void @store_private_from_flat(
; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr
+; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(5)
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[_TMP0]], align 4
; CHECK-NEXT: ret void
;
@@ -136,8 +140,10 @@ define amdgpu_kernel void @load_store_private(ptr addrspace(5) nocapture %input,
define amdgpu_kernel void @load_store_flat(ptr nocapture %input, ptr nocapture %output) #0 {
; CHECK-LABEL: define amdgpu_kernel void @load_store_flat(
; CHECK-SAME: ptr captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[INPUT]], align 4
-; CHECK-NEXT: store i32 [[VAL]], ptr [[OUTPUT]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[TMP2]], align 4
; CHECK-NEXT: ret void
;
%val = load i32, ptr %input, align 4
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
index 57453d63d7e8a..1c317786d1c20 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -48,7 +48,8 @@ define amdgpu_kernel void @memset_global_to_flat_no_md(ptr addrspace(1) %global.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
@@ -59,7 +60,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest,
define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.inline.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
@@ -70,7 +72,8 @@ define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr
define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrspace(3) %dest.group.ptr, ptr %src.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(
; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr align 4 [[SRC_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[SRC_PTR]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[TMP1]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
%cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr
@@ -116,7 +119,8 @@ define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspac
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
@@ -127,7 +131,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struc
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
@@ -138,8 +143,10 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr
define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest0, ptr %dest1, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
; CHECK-SAME: ptr [[DEST0:%.*]], ptr [[DEST1:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST0]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST0]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DEST1]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
@@ -162,7 +169,8 @@ define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %grou
define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.memmove.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1)
+; CHECK-NEXT: call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
%cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index f662d5dd85b2b..56f730ccb4189 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -682,11 +682,18 @@ m:
}
define internal void @write_then_barrier0(ptr %p) {
-; CHECK-LABEL: define {{[^@]+}}@write_then_barrier0
-; CHECK-SAME: (ptr [[P:%.*]]) {
-; CHECK-NEXT: store i32 0, ptr [[P]], align 4
-; CHECK-NEXT: call void @aligned_barrier()
-; CHECK-NEXT: ret void
+; MODULE-LABEL: define {{[^@]+}}@write_then_barrier0
+; MODULE-SAME: (ptr [[P:%.*]]) {
+; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
+; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4
+; MODULE-NEXT: call void @aligned_barrier()
+; MODULE-NEXT: ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@write_then_barrier0
+; CGSCC-SAME: (ptr [[P:%.*]]) {
+; CGSCC-NEXT: store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT: call void @aligned_barrier()
+; CGSCC-NEXT: ret void
;
store i32 0, ptr %p
call void @aligned_barrier()
@@ -695,7 +702,8 @@ define internal void @write_then_barrier0(ptr %p) {
define internal void @barrier_then_write0(ptr %p) {
; MODULE-LABEL: define {{[^@]+}}@barrier_then_write0
; MODULE-SAME: (ptr [[P:%.*]]) {
-; MODULE-NEXT: store i32 0, ptr [[P]], align 4
+; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
+; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4
; MODULE-NEXT: ret void
;
; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write0
@@ -711,7 +719,8 @@ define internal void @barrier_then_write0(ptr %p) {
define internal void @barrier_then_write_then_barrier0(ptr %p) {
; MODULE-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0
; MODULE-SAME: (ptr [[P:%.*]]) {
-; MODULE-NEXT: store i32 0, ptr [[P]], align 4
+; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
+; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4
; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
index 2f1aadc073142..81e11e048dfd0 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -85,8 +85,10 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
; CHECK: region.guarded:
; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]]
-; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]]
-; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
; CHECK: region.guarded.end:
; CHECK-NEXT: br label [[REGION_BARRIER]]
@@ -107,16 +109,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID5:%.*]]
; CHECK: region.check.tid5:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]]
; CHECK: region.guarded4:
-; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP8]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]]
; CHECK: region.guarded.end1:
; CHECK-NEXT: br label [[REGION_BARRIER2]]
; CHECK: region.barrier2:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP4]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]])
; CHECK-NEXT: br label [[REGION_EXIT3]]
; CHECK: region.exit3:
; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
@@ -128,16 +131,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]]
; CHECK: region.check.tid10:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
+; CHECK-NEXT: br i1 [[TMP10]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
; CHECK: region.guarded9:
-; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP11]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]]
; CHECK: region.guarded.end6:
; CHECK-NEXT: br label [[REGION_BARRIER7]]
; CHECK: region.barrier7:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP9]])
; CHECK-NEXT: br label [[REGION_EXIT8:%.*]]
; CHECK: region.exit8:
; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -145,16 +149,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]]
; CHECK: region.check.tid15:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
; CHECK: region.guarded14:
-; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP14]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]]
; CHECK: region.guarded.end11:
; CHECK-NEXT: br label [[REGION_BARRIER12]]
; CHECK: region.barrier12:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP12]])
; CHECK-NEXT: br label [[REGION_EXIT13:%.*]]
; CHECK: region.exit13:
; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -162,16 +167,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]]
; CHECK: region.check.tid20:
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
-; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
; CHECK: region.guarded19:
-; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP17:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP17]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]]
; CHECK: region.guarded.end16:
; CHECK-NEXT: br label [[REGION_BARRIER17]]
; CHECK: region.barrier17:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP15]])
; CHECK-NEXT: br label [[REGION_EXIT18:%.*]]
; CHECK: region.exit18:
; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -232,11 +238,13 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]]
; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
-; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 1, ptr addrspace(1) [[TMP2]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32
; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]]
-; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP3]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]]
; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]]
; CHECK-DISABLED: for.cond.i:
@@ -248,7 +256,8 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1
; CHECK-DISABLED-NEXT: [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
-; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-DISABLED: __omp_outlined__.exit:
@@ -256,15 +265,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
-; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
-; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP6]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
-; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP7]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]