[llvm] [AMDGPU] Allow SLP to analyze i8s (PR #113002)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 18 16:24:34 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-backend-amdgpu
Author: Jeffrey Byrnes (jrbyrnes)
<details>
<summary>Changes</summary>
This is a refinement of https://github.com/llvm/llvm-project/pull/105850 and https://github.com/llvm/llvm-project/pull/95328
The fundamental benefit of vectorizing i8s is to facilitate vector i8 type coercion across basic blocks, which reduces register usage and extract / repack code. That said, vectorizing i8s is not uniformly beneficial, so a precise vectorization cost model is important. https://github.com/llvm/llvm-project/pull/112999 teaches SLP about the vectorization costs that arise from legalizing illegal types, and about the benefit of type coercion. This PR enables SLP to consider i8s for vectorization. The combined effect is to vectorize i8 chains when the type-coercion benefit outweighs any potential costs.
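To make the target pattern concrete, here is a reduced form of the `uadd_sat_v4i8` case from the tests added below (the IR is taken from the patch; the before/after shapes are the point):

```llvm
; Before SLP: each lane of the <4 x i8> arguments is extracted, processed as an
; i8 scalar, and reinserted -- extract/repack code that also inflates register
; usage across the boundary.
define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
bb:
  %a0 = extractelement <4 x i8> %arg0, i64 0
  %a1 = extractelement <4 x i8> %arg0, i64 1
  %a2 = extractelement <4 x i8> %arg0, i64 2
  %a3 = extractelement <4 x i8> %arg0, i64 3
  %b0 = extractelement <4 x i8> %arg1, i64 0
  %b1 = extractelement <4 x i8> %arg1, i64 1
  %b2 = extractelement <4 x i8> %arg1, i64 2
  %b3 = extractelement <4 x i8> %arg1, i64 3
  %add0 = call i8 @llvm.uadd.sat.i8(i8 %a0, i8 %b0)
  %add1 = call i8 @llvm.uadd.sat.i8(i8 %a1, i8 %b1)
  %add2 = call i8 @llvm.uadd.sat.i8(i8 %a2, i8 %b2)
  %add3 = call i8 @llvm.uadd.sat.i8(i8 %a3, i8 %b3)
  %ins0 = insertelement <4 x i8> poison, i8 %add0, i64 0
  %ins1 = insertelement <4 x i8> %ins0, i8 %add1, i64 1
  %ins2 = insertelement <4 x i8> %ins1, i8 %add2, i64 2
  %ins3 = insertelement <4 x i8> %ins2, i8 %add3, i64 3
  ret <4 x i8> %ins3
}
declare i8 @llvm.uadd.sat.i8(i8, i8)

; After SLP (with this PR plus the #112999 cost model), the chain collapses to
; a single packed call with no extract/repack code:
;
;   %0 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %arg0, <4 x i8> %arg1)
;   ret <4 x i8> %0
```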
This PR is meant for commits starting at 204d518
---
Patch is 61.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113002.diff
12 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+13)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+4)
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+4)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+30-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+3)
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+85-6)
- (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll (+55)
- (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll (+56)
- (added) llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll (+358)
- (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll (+75)
- (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll (+327)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
Function *F, Type *RetTy, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+ /// \returns The cost of propagating Type \p DataType through Basic Block /
+ /// function boundaries. If \p IsCallingConv is specified, then \p DataType is
+ /// associated with either a function argument or return. Otherwise, \p
+ /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+ /// (this is relevant because the SelectionDAG builder may, for example,
+ /// scalarize illegal vectors across blocks, introducing extract/insert code).
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
/// \returns The number of pieces into which the provided type must be
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
+ virtual InstructionCost getDataFlowCost(Type *DataType,
+ bool IsCallingConv) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) override {
return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+ return Impl.getDataFlowCost(DataType, IsCallingConv);
+ }
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+ return 0;
+ }
+
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 10;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+ return 0;
+ }
+
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
return Cost;
}
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) const {
+ InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..882bb5bd0e2dc1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,31 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) {
+ if (isTypeLegal(DataType) || IsCallingConv)
+ return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+ return getNumberOfParts(DataType);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+ // For certain 8-bit ops, we can pack a v4i8 into a single part
+ // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we do
+ // not limit the number of parts for 8-bit vectors to the type's
+ // legalization cost. It is left up to other target queries
+ // (e.g. get*InstrCost) to decide the proper handling of 8-bit
+ // vectors.
+ if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+ if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+ unsigned ElCount = VTy->getElementCount().getFixedValue();
+ return ElCount / 4;
+ }
+ }
+
+ return BaseT::getNumberOfParts(Tp);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
@@ -337,9 +362,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
- : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
- : 1;
+
+ return (ElemWidth == 8) ? 4
+ : (ElemWidth == 16) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..efff73efd4b059 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
return TTI::PSK_FastHardware;
}
+ unsigned getNumberOfParts(Type *Tp);
unsigned getNumberOfRegisters(unsigned RCID) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
@@ -161,6 +162,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+ unsigned ScalarSize,
+ TargetTransformInfo *TTI) {
+ SetVector<Value *> ArgRoots;
+ for (unsigned I = 0; I < ScalarSize; I++) {
+ auto *Scalar = Scalars[I];
+ if (!Scalar)
+ continue;
+ auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ if (!EE)
+ continue;
+
+ auto *Vec = EE->getOperand(0);
+ if (!Vec->getType()->isVectorTy())
+ continue;
+
+ auto F = EE->getFunction();
+ auto FoundIt = find_if(
+ F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+ if (FoundIt == F->arg_end())
+ continue;
+
+ if (!ArgRoots.contains(Vec))
+ ArgRoots.insert(Vec);
+ }
+
+ if (!ArgRoots.size())
+ return 0;
+
+ unsigned Cost = 0;
+ for (auto ArgOp : ArgRoots) {
+ Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+ .getValue()
+ .value_or(0);
+ }
+ return Cost;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
- return 0;
+ return CommonCost;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+ return CommonCost +
+ processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- InstructionCost CommonCost = 0;
SmallVector<int> Mask;
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+
+ // Calculate the cost difference of propagating a vector vs. a series of
+ // scalars across blocks. This may be nonzero in the case of illegal vectors.
+ Instruction *VL0 = TE.getMainOp();
+ bool IsAPhi = VL0 && isa<PHINode>(VL0);
+ bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+ bool LiveThru = false;
+ if (HasNextEntry) {
+ Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+ LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+ }
+ if (IsAPhi || LiveThru) {
+ VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+ Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv*/ false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow = TTI->getDataFlowCost(
+     FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
- !ExtractCostCalculated.insert(EU.Scalar).second)
+ !ExtractCostCalculated.insert(EU.Scalar).second) {
continue;
+ }
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
+ // Account for any additional costs required by the calling convention for
+ // the type.
+ if (isa_and_nonnull<ReturnInst>(EU.User)) {
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+ continue;
+ }
+
// No extract cost for vector "scalar"
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
return Cost;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 3749bdf1bba394..373c18d0c64317 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -363,11 +363,66 @@ bb:
ret <4 x i16> %ins.3
}
+define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @uadd_sat_v4i8(
+; GCN-NEXT: bb:
+; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT: ret <4 x i8> [[TMP0]]
+;
+bb:
+ %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+ %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+ %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+ %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+ %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+ %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+ %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+ %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+ %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
+ %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
+ %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
+ %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
+ %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+ ret <4 x i8> %ins.3
+}
+
+define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @usub_sat_v4i8(
+; GCN-NEXT: bb:
+; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT: ret <4 x i8> [[TMP0]]
+;
+bb:
+ %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+ %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+ %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+ %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+ %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+ %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+ %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+ %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+ %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
+ %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
+ %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
+ %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
+ %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+ ret <4 x i8> %ins.3
+}
+
declare i16 @llvm.uadd.sat.i16(i16, i16) #0
declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare i16 @llvm.ssub.sat.i16(i16, i16) #0
+declare i8 @llvm.uadd.sat.i8(i8, i8) #0
+declare i8 @llvm.usub.sat.i8(i8, i8) #0
+
declare i32 @llvm.uadd.sat.i32(i32, i32) #0
declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 0bb641371825b5..1d18162a7960e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -363,11 +363,67 @@ bb:
ret <4 x i16> %ins.3
}
+define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) %dst) {
+; GCN-LABEL: @uadd_sat_v4i8(
+; GCN-NEXT: bb:
+; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT: ret <4 x i8> [[TMP0]]
+;
+bb:
+ %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+ %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+ %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+ %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+ %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+ %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+ %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+ %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+ %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
+ %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
+ %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
+ %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
+ %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+ ret <4 x i8> %ins.3
+}
+define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @usub_sat_v4i8(
+; GCN-NEXT: bb:
+; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT: ret <4 x i8> [[TMP0]]
+;
+bb:
+ %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+ %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+ %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+ %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+ %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+ %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+ %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+ %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+ %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
+ %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
+ %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
+ %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
+ %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+ ret <4 x i8> %ins.3
+
+}
+
+
declare i16 @llvm.uadd.sat.i16(i16, i16) #0
declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare i16 @llvm.ssub.sat.i16(i16, i16) #0
+declare i8 @llvm.uadd.sat.i8(i8, i8) #0
+declare i8 @llvm.usub.sat.i8(i8, i8) #0
+
declare i32 @llvm.uadd.sat.i32(i32, i32) #0
declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
new file mode 100644
index 00000000000000..6bdcf628f6c879
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
@@ -0,0 +1,358 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX9 %s
+
+define protected amdgpu_kernel void @phi(ptr addrspace(3) ...
[truncated]
``````````
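To summarize the new accounting: the getTreeCost change compares the cross-block data-flow cost of one packed vector against that of VF scalars, and credits the difference. If I read the GCN override correctly, an illegal `<4 x i8>` counts as a single part while four scalar i8s count as four, so the delta comes out negative and i8 vectorization is rewarded. A schematic restatement (illustration only, not additional patch code):

```cpp
// Schematic restatement of the cross-block accounting added to
// BoUpSLP::getTreeCost: compare moving VF scalars across a block
// boundary against moving one packed vector. A negative delta
// rewards vectorization.
InstructionCost ScalarDFlow =
    TTI->getDataFlowCost(ScalarTy, /*IsCallingConv=*/false) *
    TE.getVectorFactor();
InstructionCost VectorDFlow = TTI->getDataFlowCost(
    FixedVectorType::get(ScalarTy, TE.getVectorFactor()),
    /*IsCallingConv=*/false);
Cost += VectorDFlow - ScalarDFlow; // e.g. 1 - 4 = -3 for 4 x i8 on GCN
```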
</details>
https://github.com/llvm/llvm-project/pull/113002
More information about the llvm-commits mailing list