[llvm] [AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads (PR #77443)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 12 03:46:35 PST 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77443
From 1eb9d244b34bb93a69d39237dd1234734bf04e95 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 9 Jan 2024 10:57:26 +0000
Subject: [PATCH 1/3] [AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads
---
llvm/lib/Target/AMDGPU/SMInstructions.td | 2 -
llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 8 +-
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 2 +-
.../AMDGPU/cgp-addressing-modes-smem.ll | 36 ++++----
.../AMDGPU/divergence-driven-buildvector.ll | 32 ++++----
.../expand-scalar-carry-out-select-user.ll | 8 +-
llvm/test/CodeGen/AMDGPU/function-returns.ll | 82 +++++++++----------
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 8 +-
.../memory-legalizer-global-nontemporal.ll | 22 ++---
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 12 +--
llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 12 +--
11 files changed, 111 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2c2..3c5721d1509752 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let mayStore = 0;
let mayLoad = 1;
let hasSideEffects = 0;
- let maybeAtomic = 0;
let UseNamedOperandTable = 1;
let SchedRW = [WriteSMEM];
@@ -248,7 +247,6 @@ class SM_Atomic_Pseudo <string opName,
// Should these be set?
let ScalarStore = 1;
let hasSideEffects = 1;
- let maybeAtomic = 1;
let IsAtomicNoRet = !not(isRet);
let IsAtomicRet = isRet;
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b3165f57546fb..0d00b3dd2f92d2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -366,10 +366,10 @@ define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32
; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
%ptr = load volatile ptr addrspace(4), ptr addrspace(4) %ptr.ptr
%addrspacecast = addrspacecast ptr addrspace(4) %ptr to ptr addrspace(6)
@@ -381,10 +381,10 @@ define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspac
; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) %ptr.ptr
%addrspacecast = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(6)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 2f637df4e93022..ec37af30229219 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -382,7 +382,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_cbranch_vccnz .LBB7_2
; GCN-NEXT: ; %bb.1: ; %bb1
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 3
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index f5846c3d6db737..fcd14f12841d51 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -11,7 +11,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX67-NEXT: .LBB0_1: ; %loop
; GFX67-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_load_dword s3, s[0:1], 0x64
+; GFX67-NEXT: s_load_dword s3, s[0:1], 0x64 glc
; GFX67-NEXT: s_add_i32 s2, s2, -1
; GFX67-NEXT: s_cmp_lg_u32 s2, 0
; GFX67-NEXT: s_cbranch_scc1 .LBB0_1
@@ -23,7 +23,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX89-NEXT: .LBB0_1: ; %loop
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_load_dword s3, s[0:1], 0x190
+; GFX89-NEXT: s_load_dword s3, s[0:1], 0x190 glc
; GFX89-NEXT: s_add_i32 s2, s2, -1
; GFX89-NEXT: s_cmp_lg_u32 s2, 0
; GFX89-NEXT: s_cbranch_scc1 .LBB0_1
@@ -35,7 +35,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
; GFX12-NEXT: .LBB0_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x190 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -65,7 +65,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX6-NEXT: .LBB1_1: ; %loop
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX6-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX6-NEXT: s_add_i32 s2, s2, -1
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_cbranch_scc1 .LBB1_1
@@ -77,7 +77,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: .LBB1_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0x3e8
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0x3e8 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB1_1
@@ -89,7 +89,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX89-NEXT: .LBB1_1: ; %loop
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_load_dword s3, s[0:1], 0xfa0
+; GFX89-NEXT: s_load_dword s3, s[0:1], 0xfa0 glc
; GFX89-NEXT: s_add_i32 s2, s2, -1
; GFX89-NEXT: s_cmp_lg_u32 s2, 0
; GFX89-NEXT: s_cbranch_scc1 .LBB1_1
@@ -101,7 +101,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
; GFX12-NEXT: .LBB1_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0xfa0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -131,7 +131,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX689-NEXT: .LBB2_1: ; %loop
; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX689-NEXT: s_waitcnt lgkmcnt(0)
-; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX689-NEXT: s_add_i32 s2, s2, -1
; GFX689-NEXT: s_cmp_lg_u32 s2, 0
; GFX689-NEXT: s_cbranch_scc1 .LBB2_1
@@ -143,7 +143,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX7-NEXT: .LBB2_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0xf4240
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0xf4240 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB2_1
@@ -155,7 +155,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
; GFX12-NEXT: .LBB2_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x3d0900 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -185,7 +185,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX689-NEXT: .LBB3_1: ; %loop
; GFX689-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX689-NEXT: s_waitcnt lgkmcnt(0)
-; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX689-NEXT: s_add_i32 s2, s2, -1
; GFX689-NEXT: s_cmp_lg_u32 s2, 0
; GFX689-NEXT: s_cbranch_scc1 .LBB3_1
@@ -197,7 +197,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX7-NEXT: .LBB3_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680
+; GFX7-NEXT: s_load_dword s3, s[0:1], 0x989680 glc
; GFX7-NEXT: s_add_i32 s2, s2, -1
; GFX7-NEXT: s_cmp_lg_u32 s2, 0
; GFX7-NEXT: s_cbranch_scc1 .LBB3_1
@@ -210,7 +210,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
; GFX12-NEXT: .LBB3_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -240,7 +240,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
; GFX6789-NEXT: .LBB4_1: ; %loop
; GFX6789-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6789-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX6789-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX6789-NEXT: s_add_i32 s2, s2, -1
; GFX6789-NEXT: s_cmp_lg_u32 s2, 0
; GFX6789-NEXT: s_cbranch_scc1 .LBB4_1
@@ -256,7 +256,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
; GFX12-NEXT: .LBB4_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
@@ -286,7 +286,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX678-NEXT: .LBB5_1: ; %loop
; GFX678-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
-; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX678-NEXT: s_load_dword s3, s[0:1], 0x0 glc
; GFX678-NEXT: s_add_i32 s2, s2, -1
; GFX678-NEXT: s_cmp_lg_u32 s2, 0
; GFX678-NEXT: s_cbranch_scc1 .LBB5_1
@@ -298,7 +298,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX9-NEXT: .LBB5_1: ; %loop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
+; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190 glc
; GFX9-NEXT: s_add_i32 s2, s2, -1
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
@@ -310,7 +310,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX12-NEXT: .LBB5_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index e25a4b301537f7..8ff6971a1d7da0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -264,8 +264,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_lshl_b32 s1, s1, 16
@@ -279,8 +279,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -292,8 +292,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: ;;#ASMSTART
@@ -305,8 +305,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -548,8 +548,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_lshl_b32 s1, s1, 16
@@ -563,8 +563,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -576,8 +576,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: ;;#ASMSTART
@@ -589,8 +589,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index c744ace37a8315..e3d594f978f668 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -9,7 +9,7 @@ define i32 @s_add_co_select_user() {
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s6, s6
; GFX7-NEXT: s_or_b32 s4, s4, s5
@@ -28,7 +28,7 @@ define i32 @s_add_co_select_user() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -46,7 +46,7 @@ define i32 @s_add_co_select_user() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b64 s[4:5], 0
-; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 glc dlc
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, s5, s4, s4
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
@@ -63,7 +63,7 @@ define i32 @s_add_co_select_user() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee27981710..35bafd8d7df87c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -564,7 +564,7 @@ define <8 x i32> @v8i32_func_void() #0 {
; GFX789-LABEL: v8i32_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -576,7 +576,7 @@ define <8 x i32> @v8i32_func_void() #0 {
; GFX11-LABEL: v8i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -594,7 +594,7 @@ define <16 x i32> @v16i32_func_void() #0 {
; GFX789-LABEL: v16i32_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -608,7 +608,7 @@ define <16 x i32> @v16i32_func_void() #0 {
; GFX11-LABEL: v16i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,7 +628,7 @@ define <32 x i32> @v32i32_func_void() #0 {
; GFX789-LABEL: v32i32_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -646,7 +646,7 @@ define <32 x i32> @v32i32_func_void() #0 {
; GFX11-LABEL: v32i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -692,7 +692,7 @@ define <3 x i64> @v3i64_func_void() #0 {
; GFX789-LABEL: v3i64_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -704,7 +704,7 @@ define <3 x i64> @v3i64_func_void() #0 {
; GFX11-LABEL: v3i64_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -722,7 +722,7 @@ define <4 x i64> @v4i64_func_void() #0 {
; GFX789-LABEL: v4i64_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -734,7 +734,7 @@ define <4 x i64> @v4i64_func_void() #0 {
; GFX11-LABEL: v4i64_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -752,7 +752,7 @@ define <5 x i64> @v5i64_func_void() #0 {
; GFX789-LABEL: v5i64_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -765,7 +765,7 @@ define <5 x i64> @v5i64_func_void() #0 {
; GFX11-LABEL: v5i64_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -784,7 +784,7 @@ define <8 x i64> @v8i64_func_void() #0 {
; GFX789-LABEL: v8i64_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -798,7 +798,7 @@ define <8 x i64> @v8i64_func_void() #0 {
; GFX11-LABEL: v8i64_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -818,7 +818,7 @@ define <16 x i64> @v16i64_func_void() #0 {
; GFX789-LABEL: v16i64_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -836,7 +836,7 @@ define <16 x i64> @v16i64_func_void() #0 {
; GFX11-LABEL: v16i64_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1000,7 +1000,7 @@ define <5 x i16> @v5i16_func_void() #0 {
; CI-LABEL: v5i16_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1017,7 +1017,7 @@ define <5 x i16> @v5i16_func_void() #0 {
; GFX89-LABEL: v5i16_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
@@ -1028,7 +1028,7 @@ define <5 x i16> @v5i16_func_void() #0 {
; GFX11-LABEL: v5i16_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1044,7 +1044,7 @@ define <8 x i16> @v8i16_func_void() #0 {
; CI-LABEL: v8i16_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1063,7 +1063,7 @@ define <8 x i16> @v8i16_func_void() #0 {
; GFX89-LABEL: v8i16_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
@@ -1074,7 +1074,7 @@ define <8 x i16> @v8i16_func_void() #0 {
; GFX11-LABEL: v8i16_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1090,7 +1090,7 @@ define <16 x i16> @v16i16_func_void() #0 {
; CI-LABEL: v16i16_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1119,7 +1119,7 @@ define <16 x i16> @v16i16_func_void() #0 {
; GFX89-LABEL: v16i16_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
@@ -1131,7 +1131,7 @@ define <16 x i16> @v16i16_func_void() #0 {
; GFX11-LABEL: v16i16_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1150,7 +1150,7 @@ define <16 x i8> @v16i8_func_void() #0 {
; GFX789-LABEL: v16i8_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -1179,7 +1179,7 @@ define <16 x i8> @v16i8_func_void() #0 {
; GFX11-LABEL: v16i8_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1212,7 +1212,7 @@ define <4 x i8> @v4i8_func_void() #0 {
; GFX789-LABEL: v4i8_func_void:
; GFX789: ; %bb.0:
; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX789-NEXT: s_mov_b32 s7, 0xf000
; GFX789-NEXT: s_mov_b32 s6, -1
; GFX789-NEXT: s_waitcnt lgkmcnt(0)
@@ -1226,7 +1226,7 @@ define <4 x i8> @v4i8_func_void() #0 {
; GFX11-LABEL: v4i8_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1310,7 +1310,7 @@ define <33 x i32> @v33i32_func_void() #0 {
; CI-LABEL: v33i32_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0
@@ -1400,7 +1400,7 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX8-LABEL: v33i32_func_void:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0
@@ -1490,7 +1490,7 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX9-LABEL: v33i32_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1551,7 +1551,7 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-LABEL: v33i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1599,7 +1599,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; CI-LABEL: struct_v32i32_i32_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0
@@ -1689,7 +1689,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX8-LABEL: struct_v32i32_i32_func_void:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0
@@ -1779,7 +1779,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX9-LABEL: struct_v32i32_i32_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1840,7 +1840,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-LABEL: struct_v32i32_i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1888,7 +1888,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; CI-LABEL: struct_i32_v32i32_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1978,7 +1978,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX8-LABEL: struct_i32_v32i32_func_void:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,7 +2068,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX9-LABEL: struct_i32_v32i32_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2129,7 +2129,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-LABEL: struct_i32_v32i32_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 47f7943e076a4a..a1d0738a988f73 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1328,7 +1328,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 glc
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, s4, 4
@@ -1345,7 +1345,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0 glc
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1364,7 +1364,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s4, s[6:7], 0x0
+; CI-NEXT: s_load_dword s4, s[6:7], 0x0 glc
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1384,7 +1384,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 glc dlc
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s3, s4, 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 16e336f772df30..11692429ca7482 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 glc slc
; GFX6-NEXT: s_mov_b32 s4, s2
; GFX6-NEXT: s_mov_b32 s5, s3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -30,7 +30,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 glc slc
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,7 +43,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 slc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
@@ -54,7 +54,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 slc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
@@ -66,7 +66,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0 glc slc
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
@@ -79,7 +79,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 glc slc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
@@ -90,7 +90,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 glc slc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
@@ -101,7 +101,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
@@ -112,7 +112,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 nt
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 slc dlc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -134,7 +134,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 slc dlc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index e21b93a386c3e7..3c670f14edfc77 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -9,8 +9,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
@@ -37,8 +37,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 4b21493bd7ca66..537f49e10ff178 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -9,8 +9,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 glc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: ;;#ASMSTART
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0
+; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_and_b32 s0, s0, 0xffff
; GFX803-NEXT: s_lshl_b32 s1, s1, 16
@@ -37,8 +37,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 glc
+; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
>From ff80210da92f2a2c6667178b1e1aa18bf037b0a8 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 11 Jan 2024 16:36:06 +0000
Subject: [PATCH 2/3] Start updating memory model
---
llvm/docs/AMDGPUUsage.rst | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e05f7fc3e76627..35a53be4d943f3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5813,6 +5813,18 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
be reordered by
hardware.
+ load *none* *none* - constant - !volatile & !nontemporal
+
+ 1. s_load/s_buffer_load
+
+ - !volatile & nontemporal
+
+ 1. s_load/s_buffer_load glc=1 slc=1
+
+ - volatile
+
+ 1. s_load/s_buffer_load glc=1
+
load *none* *none* - local 1. ds_load
store *none* *none* - global - !volatile & !nontemporal
- generic
>From 9fd24cf5183cb6fcf2f5a45116668d316583b6b4 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 12 Jan 2024 11:46:14 +0000
Subject: [PATCH 3/3] Document s_waitcnt lgkmcnt(0) after volatile s_load
---
llvm/docs/AMDGPUUsage.rst | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 35a53be4d943f3..8af3ea9a9aeb77 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5803,7 +5803,7 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
- Must happen before
any following volatile
- global/generic
+ global/generic/private/constant
load/store.
- Ensures that
volatile
@@ -5824,6 +5824,19 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
- volatile
1. s_load/s_buffer_load glc=1
+ 2. s_waitcnt lgkmcnt(0)
+
+ - Must happen before
+ any following volatile
+ global/generic/private/constant
+ load/store.
+ - Ensures that
+ volatile
+ operations to
+ different
+ addresses will not
+ be reordered by
+ hardware.
load *none* *none* - local 1. ds_load
store *none* *none* - global - !volatile & !nontemporal
More information about the llvm-commits
mailing list