[llvm] 89e91e4 - [AMDGPU] Remove post-PromoteAlloca SROA run
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 10 23:29:26 PDT 2023
Author: pvanhout
Date: 2023-08-11T08:29:21+02:00
New Revision: 89e91e4c0c60d7f4d6fc729752fd25524c510a07
URL: https://github.com/llvm/llvm-project/commit/89e91e4c0c60d7f4d6fc729752fd25524c510a07
DIFF: https://github.com/llvm/llvm-project/commit/89e91e4c0c60d7f4d6fc729752fd25524c510a07.diff
LOG: [AMDGPU] Remove post-PromoteAlloca SROA run
PromoteAlloca now uses SSAUpdater, it doesn't need SROA to clean-up after it anymore.
Internal testing shows no noticeable performance impact.
Reviewed By: #amdgpu, arsenm
Differential Revision: https://reviews.llvm.org/D156398
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
llvm/test/CodeGen/AMDGPU/extload-private.ll
llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
llvm/test/CodeGen/AMDGPU/ipra.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/load-hi16.ll
llvm/test/CodeGen/AMDGPU/load-lo16.ll
llvm/test/CodeGen/AMDGPU/nested-calls.ll
llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
llvm/test/CodeGen/AMDGPU/sibling-call.ll
llvm/test/CodeGen/AMDGPU/store-hi16.ll
llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll
llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected
llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 78c6f2784316dd..43c9f183f17ba0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -173,12 +173,6 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
}
-static cl::opt<bool> EnableSROA(
- "amdgpu-sroa",
- cl::desc("Run SROA after promote alloca pass"),
- cl::ReallyHidden,
- cl::init(true));
-
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
@@ -1011,8 +1005,6 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createAMDGPUPromoteAlloca());
- if (EnableSROA)
- addPass(createSROAPass());
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
index 7653a53e82131b..448ad96fafd608 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; This is a copy of sibling-call.ll, but stops after the IRTranslator.
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 38fd778fba6813..f9b44f45be3083 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}store_fi_lifetime:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 13b4b4786b9403..0e61547c27b453 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -2,10 +2,10 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index b97361f016934e..40bf38d5a4e88a 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 0f230e5703eeff..028f32844576f6 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
; Test that non-entry function frame indices are expanded properly to
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 92a728ed26729f..07693371278701 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Kernels are not called, so there is no call preserved mask.
; GCN-LABEL: {{^}}kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index cbdc1630935ff3..cf986e7d314a20 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -189,7 +189,6 @@
; GCN-O1-NEXT: Expand Atomic instructions
; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
-; GCN-O1-NEXT: SROA
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
@@ -461,7 +460,6 @@
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: SROA
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
@@ -755,7 +753,6 @@
; GCN-O2-NEXT: Expand Atomic instructions
; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: SROA
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
@@ -1059,7 +1056,6 @@
; GCN-O3-NEXT: Expand Atomic instructions
; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: SROA
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index aa034cc185b474..ba025a2202313f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
-; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
@@ -2705,4 +2705,3 @@ entry:
}
attributes #0 = { nounwind }
-
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 0c8209baf09cde..5843ac77baa961 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
-; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
define <2 x i16> @load_local_lo_v2i16_undeflo(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_lo_v2i16_undeflo:
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 7b6ba788fd3ed0..3b87231426ea70 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Test calls when called by other callable functions rather than
; kernels.
diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index 5af9f78b471c5e..115a9a888525dc 100644
--- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck %s
;
; CFG flattening should use parallel-and mode to generate branch conditions and
; then merge if-regions with the same bodies.
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 7252aa6120cab4..9b52695fefb722 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -63,8 +63,8 @@
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' Dynamic Stack:
-; REMARK-NEXT: - DynamicStack: 'False'
+; REMARK-NEXT: - String: ' Dynamic Stack:
+; REMARK-NEXT: - DynamicStack: 'False'
; REMARK-NEXT: ..
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -179,7 +179,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 64
+; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
@@ -187,7 +187,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture readonly, i8, i64, i1 immarg)
-
+
define amdgpu_kernel void @test_indirect_w_static_stack() !dbg !10 {
%alloca = alloca <10 x i64>, align 16, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 40, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index a45f6ed9ef0452..7a01679f9972c7 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
target datalayout = "A5"
; FIXME: Why is this commuted only sometimes?
diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index dab50b16e2d5df..86c94ff7ff3280 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s
-; RxN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s
+; RxN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll
index 850036eb275529..8b88768f6ae7ab 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll
@@ -1,31 +1,31 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
define i64 @i64_test(i64 %i) nounwind readnone {
- %loc = alloca i64
- %j = load i64, i64 * %loc
+ %loc = alloca i64, addrspace(5)
+ %j = load i64, ptr addrspace(5) %loc
%r = add i64 %i, %j
ret i64 %r
}
define i64 @i32_test(i32 %i) nounwind readnone {
- %loc = alloca i32
- %j = load i32, i32 * %loc
+ %loc = alloca i32, addrspace(5)
+ %j = load i32, ptr addrspace(5) %loc
%r = add i32 %i, %j
%ext = zext i32 %r to i64
ret i64 %ext
}
define i64 @i16_test(i16 %i) nounwind readnone {
- %loc = alloca i16
- %j = load i16, i16 * %loc
+ %loc = alloca i16, addrspace(5)
+ %j = load i16, ptr addrspace(5) %loc
%r = add i16 %i, %j
%ext = zext i16 %r to i64
ret i64 %ext
}
define i64 @i8_test(i8 %i) nounwind readnone {
- %loc = alloca i8
- %j = load i8, i8 * %loc
+ %loc = alloca i8, addrspace(5)
+ %j = load i8, ptr addrspace(5) %loc
%r = add i8 %i, %j
%ext = zext i8 %r to i64
ret i64 %ext
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected
index 990ec36d2dd6d5..53a164ed717593 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected
@@ -5,10 +5,15 @@ define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-LABEL: i64_test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %loc = alloca i64
- %j = load i64, i64 * %loc
+ %loc = alloca i64, addrspace(5)
+ %j = load i64, ptr addrspace(5) %loc
%r = add i64 %i, %j
ret i64 %r
}
@@ -17,11 +22,13 @@ define i64 @i32_test(i32 %i) nounwind readnone {
; CHECK-LABEL: i32_test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %loc = alloca i32
- %j = load i32, i32 * %loc
+ %loc = alloca i32, addrspace(5)
+ %j = load i32, ptr addrspace(5) %loc
%r = add i32 %i, %j
%ext = zext i32 %r to i64
ret i64 %ext
@@ -31,11 +38,14 @@ define i64 @i16_test(i16 %i) nounwind readnone {
; CHECK-LABEL: i16_test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_ushort v1, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %loc = alloca i16
- %j = load i16, i16 * %loc
+ %loc = alloca i16, addrspace(5)
+ %j = load i16, ptr addrspace(5) %loc
%r = add i16 %i, %j
%ext = zext i16 %r to i64
ret i64 %ext
@@ -45,11 +55,14 @@ define i64 @i8_test(i8 %i) nounwind readnone {
; CHECK-LABEL: i8_test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_ubyte v1, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %loc = alloca i8
- %j = load i8, i8 * %loc
+ %loc = alloca i8, addrspace(5)
+ %j = load i8, ptr addrspace(5) %loc
%r = add i8 %i, %j
%ext = zext i8 %r to i64
ret i64 %ext
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
index 3c0c6f0de34631..d1500e002d7e92 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
@@ -70,10 +70,40 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, s33
+; CHECK-NEXT: s_mov_b32 s8, s33
; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_addk_i32 s32, 0x600
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v2, 3
+; CHECK-NEXT: v_mov_b32_e32 v3, 4
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; CHECK-NEXT: s_cbranch_execz .LBB0_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16
+; CHECK-NEXT: .LBB0_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB0_4
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_addk_i32 s32, 0xfa00
+; CHECK-NEXT: s_mov_b32 s33, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-LABEL: main:
@@ -84,16 +114,31 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s6, s33
; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_addk_i32 s32, 0x600
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, x at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, x at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v2, 1
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 3
+; CHECK-NEXT: v_mov_b32_e32 v5, 4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16
+; CHECK-NEXT: s_addk_i32 s32, 0xfa00
; CHECK-NEXT: s_mov_b32 s33, s6
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
index 304298cd5f1c3f..deadc4adb02c5e 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
@@ -11,10 +11,40 @@ define dso_local i32 @check_boundaries() #0 {
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s4, s33
+; CHECK-NEXT: s_mov_b32 s8, s33
; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_addk_i32 s32, 0x600
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v2, 3
+; CHECK-NEXT: v_mov_b32_e32 v3, 4
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; CHECK-NEXT: s_cbranch_execz .LBB0_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16
+; CHECK-NEXT: .LBB0_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB0_4
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_addk_i32 s32, 0xfa00
+; CHECK-NEXT: s_mov_b32 s33, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%1 = alloca i32, align 4, addrspace(5)
%2 = alloca i32, align 4, addrspace(5)
@@ -61,16 +91,31 @@ define dso_local i32 @main() #0 {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s6, s33
; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_addk_i32 s32, 0x600
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, x at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, x at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v2, 1
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 3
+; CHECK-NEXT: v_mov_b32_e32 v5, 4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16
+; CHECK-NEXT: s_addk_i32 s32, 0xfa00
; CHECK-NEXT: s_mov_b32 s33, s6
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index d1d722a213e300..a9ae862c622121 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -3,12 +3,20 @@
define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-LABEL: i64_test:
-; CHECK: SelectionDAG has 9 nodes:
+; CHECK: SelectionDAG has 25 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
-; CHECK-NEXT: t11: ch,glue = CopyToReg t0, Register:i32 $vgpr0, IMPLICIT_DEF:i32
-; CHECK-NEXT: t17: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t13: ch,glue = CopyToReg t11, Register:i32 $vgpr1, t17, t11:1
-; CHECK-NEXT: t14: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t13, t13:1
+; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
+; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1
+; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<53>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<53>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11>
+; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t49, t32
+; CHECK-NEXT: t23: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
+; CHECK-NEXT: t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
+; CHECK-NEXT: t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
+; CHECK-NEXT: t19: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
; CHECK-EMPTY:
%loc = alloca i64, addrspace(5)
%j = load i64, ptr addrspace(5) %loc
@@ -18,12 +26,15 @@ define i64 @i64_test(i64 %i) nounwind readnone {
define i64 @i32_test(i32 %i) nounwind readnone {
; CHECK-LABEL: i32_test:
-; CHECK: SelectionDAG has 8 nodes:
-; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK: SelectionDAG has 15 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
-; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
-; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
-; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
+; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
+; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
+; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
+; CHECK-NEXT: t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t14, Register:i32 $vgpr1, t22, t14:1
+; CHECK-NEXT: t17: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
; CHECK-EMPTY:
%loc = alloca i32, addrspace(5)
%j = load i32, ptr addrspace(5) %loc
@@ -34,12 +45,17 @@ define i64 @i32_test(i32 %i) nounwind readnone {
define i64 @i16_test(i16 %i) nounwind readnone {
; CHECK-LABEL: i16_test:
-; CHECK: SelectionDAG has 8 nodes:
-; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK: SelectionDAG has 18 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
-; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
-; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
-; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
+; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
+; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN<Mem:(dereferenceable load (s16) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
+; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535>
+; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
+; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
+; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
; CHECK-EMPTY:
%loc = alloca i16, addrspace(5)
%j = load i16, ptr addrspace(5) %loc
@@ -50,12 +66,17 @@ define i64 @i16_test(i16 %i) nounwind readnone {
define i64 @i8_test(i8 %i) nounwind readnone {
; CHECK-LABEL: i8_test:
-; CHECK: SelectionDAG has 8 nodes:
-; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK: SelectionDAG has 18 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
-; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5
-; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1
-; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1
+; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
+; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN<Mem:(dereferenceable load (s8) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
+; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>
+; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
+; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
+; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
; CHECK-EMPTY:
%loc = alloca i8, addrspace(5)
%j = load i8, ptr addrspace(5) %loc
More information about the llvm-commits
mailing list