[llvm] [AMDGPU] Do not widen scalar loads on GFX12 (PR #78724)

Jay Foad via llvm-commits llvm-commits@lists.llvm.org
Fri Jan 19 07:10:11 PST 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/78724

GFX12 has sub-dword scalar loads, so the widening this pass performs is unnecessary there.
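
For reference, the transform being disabled rewrites naturally sub-dword
uniform loads from the constant address space into a dword load plus
shift/trunc. A minimal sketch, distilled from the test updates in this
patch (the global and function names here are illustrative only):

  @gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4

  define amdgpu_kernel void @load_byte() {
    %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
    store i8 %load, ptr addrspace(1) undef
    ret void
  }

  ; On gfx900 AMDGPULateCodeGenPrepare widens the i8 load:
  ;   %1 = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
  ;   %2 = lshr i32 %1, 16
  ;   %3 = trunc i32 %2 to i8
  ; On gfx1200 the i8 load is kept, since it can select to s_load_u8.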


From 990675b6b7948deca26d8654557d138b05790ceb Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 20 Jul 2023 13:46:25 +0100
Subject: [PATCH 1/2] Add GFX12 test coverage for AMDGPULateCodeGenPrepare

---
 .../AMDGPU/amdgpu-late-codegenprepare.ll      |   1 +
 .../AMDGPU/indirect-call-known-callees.ll     | 121 ++++++++++++------
 2 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
index 0de0ac7b77a777..251a2ffab40989 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s
 
 ; Make sure we don't crash when trying to create a bitcast between
 ; address spaces
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index fe7323eeadf8ab..2f876d13ff0812 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
 
 ; We have an indirect call with a known set of callees, which are
 ; known to not need any special inputs. The ABI still needs to use the
@@ -8,35 +9,61 @@
 ; FIXME: Passing real values for workitem ID, and 0s that can be undef
 
 define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
-; CHECK-LABEL: indirect_call_known_no_special_inputs:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dword s7, s[4:5], 0x0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
-; CHECK-NEXT:    s_getpc_b64 s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s8, snork@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
-; CHECK-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_and_b32 s4, 1, s7
-; CHECK-NEXT:    s_cmp_eq_u32 s4, 1
-; CHECK-NEXT:    v_mov_b32_e32 v31, v0
-; CHECK-NEXT:    s_cselect_b32 s5, s13, s11
-; CHECK-NEXT:    s_cselect_b32 s4, s12, s10
-; CHECK-NEXT:    s_mov_b32 s12, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CHECK-NEXT:    s_endpgm
+; GFX9-LABEL: indirect_call_known_no_special_inputs:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT:    s_add_u32 s0, s0, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, snork@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_cselect_b32 s5, s13, s11
+; GFX9-NEXT:    s_cselect_b32 s4, s12, s10
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX12-LABEL: indirect_call_known_no_special_inputs:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-NEXT:    s_add_co_u32 s2, s2, snork@gotpcrel32@lo+4
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+12
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+4
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+12
+; GFX12-NEXT:    s_load_b32 s6, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
+; GFX12-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-NEXT:    s_mov_b32 s12, ttmp9
+; GFX12-NEXT:    s_mov_b64 s[8:9], 0
+; GFX12-NEXT:    s_mov_b32 s32, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_and_b32 s4, 1, s6
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_eq_u32 s4, 1
+; GFX12-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT:    s_endpgm
 
 bb:
   %cond = load i1, ptr addrspace(4) null
@@ -46,19 +73,37 @@ bb:
 }
 
 define void @wobble() {
-; CHECK-LABEL: wobble:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: wobble:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: wobble:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   ret void
 }
 
 define void @snork() {
-; CHECK-LABEL: snork:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: snork:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: snork:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   ret void
 }

From 6151b06f92e2f516d13bf9ed5e56319ad9ecadaa Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 20 Jul 2023 13:54:11 +0100
Subject: [PATCH 2/2] [AMDGPU] Do not widen scalar loads on GFX12

GFX12 has sub-dword scalar loads, so the widening this pass performs is unnecessary there.
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 10 ++-
 .../AMDGPU/amdgpu-late-codegenprepare.ll      | 85 ++++++++++++-------
 .../AMDGPU/indirect-call-known-callees.ll     | 30 ++++---
 3 files changed, 80 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 1983e9f8d4af71..69fdeaebe0a018 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -13,9 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/InitializePasses.h"
@@ -58,6 +60,7 @@ class AMDGPULateCodeGenPrepare
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<UniformityInfoWrapperPass>();
     AU.setPreservesAll();
@@ -90,7 +93,11 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
-  // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
+  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  if (ST.hasScalarSubwordLoads())
+    return false;
 
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
@@ -181,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 
 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
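
For context, hasScalarSubwordLoads() is the GCNSubtarget hook gating the
early exit added above. Roughly, its meaning is as sketched below (the
authoritative definition lives in GCNSubtarget.h; this sketch is an
assumption, not a quote of that header):

  // GFX12 adds sub-dword scalar loads (s_load_u8/i8/u16/i16), so uniform
  // i8/i16 loads no longer need to be widened to i32 in IR.
  bool GCNSubtarget::hasScalarSubwordLoads() const {
    return getGeneration() >= GFX12;
  }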
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
index 251a2ffab40989..83016f1d2d3c85 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
@@ -1,16 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX9
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX12
 
 ; Make sure we don't crash when trying to create a bitcast between
 ; address spaces
 define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
-; CHECK-LABEL: @constant_from_offset_cast_generic_null(
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT:    ret void
+; GFX9-LABEL: @constant_from_offset_cast_generic_null(
+; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT:    ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_generic_null(
+; GFX12-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT:    ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -18,12 +23,17 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
 }
 
 define amdgpu_kernel void @constant_from_offset_cast_global_null() {
-; CHECK-LABEL: @constant_from_offset_cast_global_null(
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT:    ret void
+; GFX9-LABEL: @constant_from_offset_cast_global_null(
+; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT:    ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_global_null(
+; GFX12-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT:    ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -33,12 +43,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_null() {
 @gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4
 
 define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
-; CHECK-LABEL: @constant_from_offset_cast_global_gv(
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT:    ret void
+; GFX9-LABEL: @constant_from_offset_cast_global_gv(
+; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT:    ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_global_gv(
+; GFX12-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT:    ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -46,12 +61,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
 }
 
 define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
-; CHECK-LABEL: @constant_from_offset_cast_generic_inttoptr(
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT:    ret void
+; GFX9-LABEL: @constant_from_offset_cast_generic_inttoptr(
+; GFX9-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT:    store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT:    ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_generic_inttoptr(
+; GFX12-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT:    ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -59,10 +79,15 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
 }
 
 define amdgpu_kernel void @constant_from_inttoptr() {
-; CHECK-LABEL: @constant_from_inttoptr(
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
-; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT:    ret void
+; GFX9-LABEL: @constant_from_inttoptr(
+; GFX9-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
+; GFX9-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT:    ret void
+;
+; GFX12-LABEL: @constant_from_inttoptr(
+; GFX12-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
+; GFX12-NEXT:    store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT:    ret void
 ;
   %load = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
   store i8 %load, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 2f876d13ff0812..9965d214cc9b3f 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -41,28 +41,30 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ;
 ; GFX12-LABEL: indirect_call_known_no_special_inputs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_getpc_b64 s[2:3]
-; GFX12-NEXT:    s_add_co_u32 s2, s2, snork@gotpcrel32@lo+4
-; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+12
-; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_getpc_b64 s[4:5]
-; GFX12-NEXT:    s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+4
-; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+12
-; GFX12-NEXT:    s_load_b32 s6, s[0:1], 0x0
-; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16
+; GFX12-NEXT:    s_mov_b64 s[2:3], 0
+; GFX12-NEXT:    s_getpc_b64 s[6:7]
+; GFX12-NEXT:    s_sext_i32_i16 s7, s7
+; GFX12-NEXT:    s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_u8 s1, s[2:3], 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v31, v0
-; GFX12-NEXT:    s_mov_b32 s12, ttmp9
 ; GFX12-NEXT:    s_mov_b64 s[8:9], 0
+; GFX12-NEXT:    s_mov_b32 s12, s0
 ; GFX12-NEXT:    s_mov_b32 s32, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_and_b32 s4, 1, s6
+; GFX12-NEXT:    s_and_b32 s1, 1, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s4, 1
-; GFX12-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT:    s_cmp_eq_u32 s1, 1
+; GFX12-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX12-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX12-NEXT:    s_endpgm
 
 bb:


