[llvm] [AMDGPU] Enable vectorization of i8 values. (PR #134934)
Gheorghe-Teodor Bercea via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 08:15:32 PDT 2025
https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/134934
>From 32c7371d4115b9009902fb45c7a2238769579a8d Mon Sep 17 00:00:00 2001
From: Doru Bercea <doru.bercea at amd.com>
Date: Mon, 7 Apr 2025 11:03:20 -0400
Subject: [PATCH] Enable vectorization of i8 values.
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 37 ++-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 14 +
llvm/test/Analysis/CostModel/AMDGPU/load.ll | 66 ++--
llvm/test/Analysis/CostModel/AMDGPU/store.ll | 66 ++--
.../SLPVectorizer/AMDGPU/vectorize-i8.ll | 309 ++++--------------
5 files changed, 186 insertions(+), 306 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..9378d6e537c42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
- : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
- : 1;
+ // For a given width return the max number of elements that can be combined
+ // into a wider bit value:
+ return ElemWidth == 8 ? 4
+ : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1443,3 +1446,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
}
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo,
+ const Instruction *I) const {
+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ VecTy->getElementType()->isIntegerTy(8)) {
+ return ((DL.getTypeSizeInBits(VecTy) - 1) /
+ getLoadStoreVecRegBitWidth(AddressSpace)) +
+ 1;
+ }
+ }
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+ OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+ if (VecTy->getElementType()->isIntegerTy(8)) {
+ unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+ return ((ElementCount - 1) / 4) + 1;
+ }
+ }
+ return BaseT::getNumberOfParts(Tp);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631a..4b9f88d3a459a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -281,6 +281,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
+
+ /// Account for loads and stores of i8 vector types having reduced cost. For
+ /// example, the cost of loading four i8 values is the same as the cost of
+ /// loading a single i32 value.
+ InstructionCost getMemoryOpCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+ const Instruction *I = nullptr) const override;
+
+ /// When counting parts on AMD GPUs, account for i8s being grouped
+ /// together under a single i32 value. Otherwise fall back to base
+ /// implementation.
+ unsigned getNumberOfParts(Type *Tp) const override;
};
} // end namespace llvm
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load.ll b/llvm/test/Analysis/CostModel/AMDGPU/load.ll
index 3f8016178e719..6ec84bd88cd4d 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/load.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/load.ll
@@ -21,17 +21,17 @@ define void @loads_i1(i32 %arg) {
define void @loads_i8(i32 %arg) {
; GFX90A-LABEL: 'loads_i8'
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = load <2 x i8>, ptr poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i8>, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = load <4 x i8>, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i8>, ptr poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i8>, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i8, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %7 = load <3 x i8>, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = load <4 x i8>, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <3 x i8>, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <4 x i8>, ptr poison, align 1
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load i8, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i8>, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <3 x i8>, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i8>, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i8>, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <3 x i8>, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i8>, ptr poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
load i8, ptr poison
@@ -154,35 +154,35 @@ define void @loads_addrspace_1(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(1) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(1) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(1) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(1) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(1) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(1) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(1) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(1) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -241,35 +241,35 @@ define void @loads_addrspace_3(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(3) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(3) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(3) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(3) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(3) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(3) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(3) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(3) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -328,35 +328,35 @@ define void @loads_addrspace_5(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(5) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(5) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(5) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(5) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(5) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(5) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(5) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(5) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/store.ll b/llvm/test/Analysis/CostModel/AMDGPU/store.ll
index 9672c3256c751..6dc4befdfbd9e 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/store.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/store.ll
@@ -20,17 +20,17 @@ define void @stores_i1(i32 %arg) {
define void @stores_i8(i32 %arg) {
; GFX90A-LABEL: 'stores_i8'
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 1
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
store i8 poison, ptr poison
@@ -153,35 +153,35 @@ define void @stores_addrspace_1(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(1) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(1) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(1) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(1) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(1) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(1) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(1) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(1) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -241,35 +241,35 @@ define void @stores_addrspace_3(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(3) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(3) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(3) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(3) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(3) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(3) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(3) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(3) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -329,35 +329,35 @@ define void @stores_addrspace_5(i32 %arg) {
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(5) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, align 2
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, align 2
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(5) poison, align 2
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(5) poison, align 4
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(5) poison, align 8
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(5) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(5) poison, align 16
-; GFX90A-NEXT: Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128
+; GFX90A-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(5) poison, align 256
; GFX90A-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(5) poison, align 512
; GFX90A-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
index b9b1bc1be681e..04eeb84462cd0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
@@ -504,19 +504,14 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
-; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX7-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX7-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -529,19 +524,14 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
-; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -554,19 +544,14 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
-; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -606,24 +591,19 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
-; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX7-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX7-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX7-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
@@ -637,24 +617,19 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
-; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX8-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
@@ -668,24 +643,19 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
-; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX9-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
@@ -732,31 +702,14 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
-; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX7-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX7-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX7-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT: [[VEC131:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -769,31 +722,14 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
-; GFX8-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX8-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT: [[VEC131:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -806,31 +742,14 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX9-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
-; GFX9-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX9-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX9-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX9-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[VEC131:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -882,31 +801,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
-; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX7-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX7-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX7-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX7-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -919,31 +820,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
-; GFX8-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX8-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX8-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX8-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -956,31 +839,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX9-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
-; GFX9-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX9-NEXT: store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX9-NEXT: store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX9-NEXT: store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT: store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX9-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX9-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -1031,10 +896,6 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
; GFX7-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX7-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
-; GFX7-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX7-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX7-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX7-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1044,22 +905,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX7-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX7-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT: store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX7-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX8-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
-; GFX8-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX8-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX8-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX8-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1069,22 +920,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX8-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX8-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT: store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX8-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX9-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
-; GFX9-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX9-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX9-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1094,13 +935,7 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX9-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX9-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX9-NEXT: store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT: store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX9-NEXT: store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX9-NEXT: ret void
;
entry:
More information about the llvm-commits mailing list