[llvm] [AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads (PR #77443)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 12 03:46:35 PST 2024


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77443

>From 1eb9d244b34bb93a69d39237dd1234734bf04e95 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 9 Jan 2024 10:57:26 +0000
Subject: [PATCH 1/3] [AMDGPU] Set glc/slc on volatile/nontemporal SMEM loads

---
 llvm/lib/Target/AMDGPU/SMInstructions.td      |  2 -
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |  8 +-
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll |  2 +-
 .../AMDGPU/cgp-addressing-modes-smem.ll       | 36 ++++----
 .../AMDGPU/divergence-driven-buildvector.ll   | 32 ++++----
 .../expand-scalar-carry-out-select-user.ll    |  8 +-
 llvm/test/CodeGen/AMDGPU/function-returns.ll  | 82 +++++++++----------
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  8 +-
 .../memory-legalizer-global-nontemporal.ll    | 22 ++---
 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll        | 12 +--
 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll        | 12 +--
 11 files changed, 111 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2c2..3c5721d1509752 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,7 +29,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
   let mayStore = 0;
   let mayLoad = 1;
   let hasSideEffects = 0;
-  let maybeAtomic = 0;
   let UseNamedOperandTable = 1;
   let SchedRW = [WriteSMEM];
 
@@ -248,7 +247,6 @@ class SM_Atomic_Pseudo <string opName,
   // Should these be set?
   let ScalarStore = 1;
   let hasSideEffects = 1;
-  let maybeAtomic = 1;
 
   let IsAtomicNoRet = !not(isRet);
   let IsAtomicRet = isRet;
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b3165f57546fb..0d00b3dd2f92d2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -366,10 +366,10 @@ define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32
 ; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
 ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
 ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
 ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
 ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
 define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
   %ptr = load volatile ptr addrspace(4), ptr addrspace(4) %ptr.ptr
   %addrspacecast = addrspacecast ptr addrspace(4) %ptr to ptr addrspace(6)
@@ -381,10 +381,10 @@ define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(ptr addrspac
 ; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
 ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
 ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
-; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
+; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0 glc{{$}}
 ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
 ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
-; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
+; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0 glc{{$}}
 define amdgpu_kernel void @use_global_to_constant32_addrspacecast(ptr addrspace(4) %ptr.ptr, i32 %offset) #0 {
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) %ptr.ptr
   %addrspacecast = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(6)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 2f637df4e93022..ec37af30229219 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -382,7 +382,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GCN-NEXT:    s_cbranch_vccnz .LBB7_2
 ; GCN-NEXT:  ; %bb.1: ; %bb1
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 3
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index f5846c3d6db737..fcd14f12841d51 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -11,7 +11,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX67-NEXT:  .LBB0_1: ; %loop
 ; GFX67-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX67-NEXT:    s_load_dword s3, s[0:1], 0x64
+; GFX67-NEXT:    s_load_dword s3, s[0:1], 0x64 glc
 ; GFX67-NEXT:    s_add_i32 s2, s2, -1
 ; GFX67-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX67-NEXT:    s_cbranch_scc1 .LBB0_1
@@ -23,7 +23,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX89-NEXT:  .LBB0_1: ; %loop
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_load_dword s3, s[0:1], 0x190
+; GFX89-NEXT:    s_load_dword s3, s[0:1], 0x190 glc
 ; GFX89-NEXT:    s_add_i32 s2, s2, -1
 ; GFX89-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX89-NEXT:    s_cbranch_scc1 .LBB0_1
@@ -35,7 +35,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX12-NEXT:  .LBB0_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x190 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -65,7 +65,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX6-NEXT:  .LBB1_1: ; %loop
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX6-NEXT:    s_add_i32 s2, s2, -1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -77,7 +77,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX7-NEXT:  .LBB1_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x3e8
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x3e8 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -89,7 +89,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX89-NEXT:  .LBB1_1: ; %loop
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_load_dword s3, s[0:1], 0xfa0
+; GFX89-NEXT:    s_load_dword s3, s[0:1], 0xfa0 glc
 ; GFX89-NEXT:    s_add_i32 s2, s2, -1
 ; GFX89-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX89-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -101,7 +101,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX12-NEXT:  .LBB1_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0xfa0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0xfa0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -131,7 +131,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX689-NEXT:  .LBB2_1: ; %loop
 ; GFX689-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX689-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX689-NEXT:    s_add_i32 s2, s2, -1
 ; GFX689-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX689-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -143,7 +143,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX7-NEXT:  .LBB2_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xf4240
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xf4240 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB2_1
@@ -155,7 +155,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX12-NEXT:  .LBB2_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x3d0900
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x3d0900 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -185,7 +185,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX689-NEXT:  .LBB3_1: ; %loop
 ; GFX689-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX689-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX689-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX689-NEXT:    s_add_i32 s2, s2, -1
 ; GFX689-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX689-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -197,7 +197,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX7-NEXT:  .LBB3_1: ; %loop
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x989680
+; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x989680 glc
 ; GFX7-NEXT:    s_add_i32 s2, s2, -1
 ; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -210,7 +210,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX12-NEXT:  .LBB3_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -240,7 +240,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
 ; GFX6789-NEXT:  .LBB4_1: ; %loop
 ; GFX6789-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6789-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6789-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX6789-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX6789-NEXT:    s_add_i32 s2, s2, -1
 ; GFX6789-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6789-NEXT:    s_cbranch_scc1 .LBB4_1
@@ -256,7 +256,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
 ; GFX12-NEXT:  .LBB4_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
@@ -286,7 +286,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX678-NEXT:  .LBB5_1: ; %loop
 ; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX678-NEXT:    s_load_dword s3, s[0:1], 0x0 glc
 ; GFX678-NEXT:    s_add_i32 s2, s2, -1
 ; GFX678-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX678-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -298,7 +298,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX9-NEXT:  .LBB5_1: ; %loop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190
+; GFX9-NEXT:    s_load_dword s3, s[0:1], -0x190 glc
 ; GFX9-NEXT:    s_add_i32 s2, s2, -1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -310,7 +310,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX12-NEXT:  .LBB5_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index e25a4b301537f7..8ff6971a1d7da0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -264,8 +264,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
@@ -279,8 +279,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -292,8 +292,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
@@ -305,8 +305,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -548,8 +548,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
@@ -563,8 +563,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -576,8 +576,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
@@ -589,8 +589,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index c744ace37a8315..e3d594f978f668 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -9,7 +9,7 @@ define i32 @s_add_co_select_user() {
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0 glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s6, s6
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
@@ -28,7 +28,7 @@ define i32 @s_add_co_select_user() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], s6, s6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
@@ -46,7 +46,7 @@ define i32 @s_add_co_select_user() {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 0
-; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0 glc dlc
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, s4, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
@@ -63,7 +63,7 @@ define i32 @s_add_co_select_user() {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s1, s0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee27981710..35bafd8d7df87c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -564,7 +564,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GFX789-LABEL: v8i32_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -576,7 +576,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GFX11-LABEL: v8i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -594,7 +594,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX789-LABEL: v16i32_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -608,7 +608,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX11-LABEL: v16i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -628,7 +628,7 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GFX789-LABEL: v32i32_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -646,7 +646,7 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GFX11-LABEL: v32i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -692,7 +692,7 @@ define <3 x i64> @v3i64_func_void() #0 {
 ; GFX789-LABEL: v3i64_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -704,7 +704,7 @@ define <3 x i64> @v3i64_func_void() #0 {
 ; GFX11-LABEL: v3i64_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -722,7 +722,7 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GFX789-LABEL: v4i64_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -734,7 +734,7 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GFX11-LABEL: v4i64_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -752,7 +752,7 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GFX789-LABEL: v5i64_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -765,7 +765,7 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GFX11-LABEL: v5i64_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -784,7 +784,7 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GFX789-LABEL: v8i64_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -798,7 +798,7 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GFX11-LABEL: v8i64_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -818,7 +818,7 @@ define <16 x i64> @v16i64_func_void() #0 {
 ; GFX789-LABEL: v16i64_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -836,7 +836,7 @@ define <16 x i64> @v16i64_func_void() #0 {
 ; GFX11-LABEL: v16i64_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1000,7 +1000,7 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; CI-LABEL: v5i16_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1017,7 +1017,7 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; GFX89-LABEL: v5i16_func_void:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1028,7 +1028,7 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; GFX11-LABEL: v5i16_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1044,7 +1044,7 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; CI-LABEL: v8i16_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1063,7 +1063,7 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; GFX89-LABEL: v8i16_func_void:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1074,7 +1074,7 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; GFX11-LABEL: v8i16_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1090,7 +1090,7 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; CI-LABEL: v16i16_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1119,7 +1119,7 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; GFX89-LABEL: v16i16_func_void:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1131,7 +1131,7 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; GFX11-LABEL: v16i16_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1150,7 +1150,7 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX789-LABEL: v16i8_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1179,7 +1179,7 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX11-LABEL: v16i8_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1212,7 +1212,7 @@ define <4  x i8> @v4i8_func_void() #0 {
 ; GFX789-LABEL: v4i8_func_void:
 ; GFX789:       ; %bb.0:
 ; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1226,7 +1226,7 @@ define <4  x i8> @v4i8_func_void() #0 {
 ; GFX11-LABEL: v4i8_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1310,7 +1310,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; CI-LABEL: v33i32_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    v_add_i32_e32 v34, vcc, 0x80, v0
@@ -1400,7 +1400,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX8-LABEL: v33i32_func_void:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 0x80, v0
@@ -1490,7 +1490,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9-LABEL: v33i32_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1551,7 +1551,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-LABEL: v33i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1599,7 +1599,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; CI-LABEL: struct_v32i32_i32_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    v_add_i32_e32 v34, vcc, 0x80, v0
@@ -1689,7 +1689,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX8-LABEL: struct_v32i32_i32_func_void:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 0x80, v0
@@ -1779,7 +1779,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9-LABEL: struct_v32i32_i32_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1840,7 +1840,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-LABEL: struct_v32i32_i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1888,7 +1888,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; CI-LABEL: struct_i32_v32i32_func_void:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1978,7 +1978,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX8-LABEL: struct_i32_v32i32_func_void:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2068,7 +2068,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX9-LABEL: struct_i32_v32i32_func_void:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0 glc
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2129,7 +2129,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-LABEL: struct_i32_v32i32_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 glc dlc
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 47f7943e076a4a..a1d0738a988f73 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1328,7 +1328,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0 glc
 ; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
@@ -1345,7 +1345,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0 glc
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1364,7 +1364,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; CI-NEXT:    s_load_dword s4, s[6:7], 0x0 glc
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1384,7 +1384,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
+; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0 glc dlc
 ; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 16e336f772df30..11692429ca7482 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0 glc slc
 ; GFX6-NEXT:    s_mov_b32 s4, s2
 ; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -30,7 +30,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0 glc slc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -43,7 +43,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0 slc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -54,7 +54,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0 slc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -66,7 +66,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -79,7 +79,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0 glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -90,7 +90,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0 glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -101,7 +101,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0 nt
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
@@ -112,7 +112,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0 nt
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX11-WGP:       ; %bb.0: ; %entry
 ; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0 slc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
@@ -134,7 +134,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX11-CU:       ; %bb.0: ; %entry
 ; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0 slc dlc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index e21b93a386c3e7..3c670f14edfc77 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -9,8 +9,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -37,8 +37,8 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 4b21493bd7ca66..537f49e10ff178 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -9,8 +9,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0 glc
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0 glc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX803-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GFX803-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX803-NEXT:    s_lshl_b32 s1, s1, 16
@@ -37,8 +37,8 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4)
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0 glc
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0 glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16

>From ff80210da92f2a2c6667178b1e1aa18bf037b0a8 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 11 Jan 2024 16:36:06 +0000
Subject: [PATCH 2/3] Start updating memory model

---
 llvm/docs/AMDGPUUsage.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e05f7fc3e76627..35a53be4d943f3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5813,6 +5813,18 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                               be reordered by
                                                               hardware.
 
+     load         *none*       *none*         - constant - !volatile & !nontemporal
+
+                                                           1. s_load/s_buffer_load
+
+                                                         - !volatile & nontemporal
+
+                                                           1. s_load/s_buffer_load glc=1 slc=1
+
+                                                         - volatile
+
+                                                           1. s_load/s_buffer_load glc=1
+
      load         *none*       *none*         - local    1. ds_load
      store        *none*       *none*         - global   - !volatile & !nontemporal
                                               - generic

>From 9fd24cf5183cb6fcf2f5a45116668d316583b6b4 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 12 Jan 2024 11:46:14 +0000
Subject: [PATCH 3/3] Document s_waitcnt lgkmcnt(0) after volatile s_load

---
 llvm/docs/AMDGPUUsage.rst | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 35a53be4d943f3..8af3ea9a9aeb77 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5803,7 +5803,7 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
 
                                                             - Must happen before
                                                               any following volatile
-                                                              global/generic
+                                                              global/generic/private/constant
                                                               load/store.
                                                             - Ensures that
                                                               volatile
@@ -5824,6 +5824,19 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                          - volatile
 
                                                            1. s_load/s_buffer_load glc=1
+                                                           2. s_waitcnt lgkmcnt(0)
+
+                                                            - Must happen before
+                                                              any following volatile
+                                                              global/generic/private/constant
+                                                              load/store.
+                                                            - Ensures that
+                                                              volatile
+                                                              operations to
+                                                              different
+                                                              addresses will not
+                                                              be reordered by
+                                                              hardware.
 
      load         *none*       *none*         - local    1. ds_load
      store        *none*       *none*         - global   - !volatile & !nontemporal



More information about the llvm-commits mailing list