[llvm] 9a98ab5 - [AArch64][SVE2] Change the cost of extends with S/URHADD to 0

Tue Aug 29 05:43:59 PDT 2023

Author: Kerry McLaughlin
Date: 2023-08-29T12:24:47Z
New Revision: 9a98ab589a4fe17012b6af4fd14c0166d2d58d2b

URL: https://github.com/llvm/llvm-project/commit/9a98ab589a4fe17012b6af4fd14c0166d2d58d2b
DIFF: https://github.com/llvm/llvm-project/commit/9a98ab589a4fe17012b6af4fd14c0166d2d58d2b.diff

LOG: [AArch64][SVE2] Change the cost of extends with S/URHADD to 0

When SVE2 is enabled, we can combine an add of 1, an add and a shift right by 1
into a single s/urhadd instruction. If the operands to the adds are extended,
these extends will fold into the s/urhadd and their costs should be 0.

Reviewed By: david-arm, dtemirbulatov

Differential Revision: https://reviews.llvm.org/D157628

Added: 
    llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c8e0fb12b0f335..c5703b15d07d83 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,56 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+// Returns true if ExtUser is an add within the pattern below, which a single
+// s/urhadd instruction implements, making the extends free:
+//   %x = add ((zext i8 -> i16), 1)
+//   %y = (zext i8 -> i16)
+//   trunc i16 (lshr (add %x, %y), 1) -> i8
+// Both extends must be the same kind. Ext and Dst are not inspected here.
+bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser,
+                                        const CastInst *Ext, Type *Dst,
+                                        Type *Src) {
+
+  // The source must be a legal vector type; scalable types also need SVE2.
+  if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
+      (Src->isScalableTy() && !ST->hasSVE2()))
+    return false;
+
+  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
+    return false;
+
+  // Look for trunc/lshr/add before trying to match the pattern.
+  const Instruction *Add = ExtUser;
+  auto *AddUser =
+      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
+  if (AddUser && AddUser->getOpcode() == Instruction::Add)
+    Add = AddUser;
+
+  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
+  if (!Shr || Shr->getOpcode() != Instruction::LShr)
+    return false;
+
+  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
+  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() !=
+          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  // Try to match the whole pattern. The adds are matched commutatively, so
+  // the extend may end up bound to either Ex1 or Ex2.
+  Instruction *Ex1, *Ex2;
+  if (!(match(Add, m_c_Add(m_Instruction(Ex1),
+                           m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
+    return false;
+
+  // Ensure both operands are extends of the same kind (zext or sext).
+  if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
+      Ex1->getOpcode() == Ex2->getOpcode())
+    return true;
+
+  return false;
+}
+
 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                  Type *Src,
                                                  TTI::CastContextHint CCH,
@@ -2068,6 +2118,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       } else // Others are free so long as isWideningInstruction returned true.
         return 0;
     }
+
+    // The cast will be free for the s/urhadd instructions
+    if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+        isExtPartOfAvgExpr(SingleUser, cast<CastInst>(I), Dst, Src))
+      return 0;
   }
 
   // TODO: Allow non-throughput costs that aren't binary.

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 88e9bf70200399..4d24a0ef20b649 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -163,6 +163,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I = nullptr);
 
+  bool isExtPartOfAvgExpr(const Instruction *ExtUser, const CastInst *Ext,
+                          Type *Dst, Type *Src);
+
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,

diff  --git a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
new file mode 100644
index 00000000000000..94a37d944fc221
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
@@ -0,0 +1,201 @@
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s -check-prefix=SVE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefix=SVE2
+
+; SRHADD
+
+define void @srhadd_i8_sext_i16_fixed(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i8_sext_i16_fixed'
+; SVE:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+;
+; SVE2-LABEL: 'srhadd_i8_sext_i16_fixed'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+;
+  %ld1 = load <16 x i8>, ptr %a
+  %ld2 = load <16 x i8>, ptr %b
+  %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+  %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+  %add1 = add nuw nsw <16 x i16> %ext1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <16 x i16> %add1, %ext2
+  %shr = lshr <16 x i16> %add2, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
+  %trunc = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @srhadd_i8_sext_i16_scalable(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i8_sext_i16_scalable'
+; SVE:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'srhadd_i8_sext_i16_scalable'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @srhadd_i16_sext_i64_scalable(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i16_sext_i64_scalable'
+; SVE:       Cost Model: Found an estimated cost of 6 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+;
+; SVE2-LABEL: 'srhadd_i16_sext_i64_scalable'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+  %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+  %add1 = add nuw nsw <vscale x 8 x i64> %ext1, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i64> %add1, %ext2
+  %shr = lshr <vscale x 8 x i64> %add2, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i64> %shr to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %trunc, ptr %a
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i32_zext_i64_fixed(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i32_zext_i64_fixed'
+; SVE:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+;
+; SVE2-LABEL: 'urhadd_i32_zext_i64_fixed'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+;
+  %ld1 = load <4 x i32>, ptr %a
+  %ld2 = load <4 x i32>, ptr %b
+  %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+  %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+  %add1 = add nuw nsw <4 x i64> %ext1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
+  %add2 = add nuw nsw <4 x i64> %add1, %ext2
+  %shr = lshr <4 x i64> %add2, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
+  %trunc = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %trunc, ptr %a
+  ret void
+}
+
+define void @urhadd_i8_zext_i64(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i8_zext_i64'
+; SVE:       Cost Model: Found an estimated cost of 14 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+;
+; SVE2-LABEL: 'urhadd_i8_zext_i64'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+  %add1 = add nuw nsw <vscale x 16 x i64> %ext1, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i64> %add1, %ext2
+  %shr = lshr <vscale x 16 x i64> %add2, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i64> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i16_zext_i32'
+; SVE:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+; SVE2-LABEL: 'urhadd_i16_zext_i32'
+; SVE2:       Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+  %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+  %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
+  %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %trunc, ptr %a
+  ret void
+}
+
+; NEGATIVE TESTS
+
+define void @ext_operand_mismatch(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'ext_operand_mismatch'
+; SVE:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'ext_operand_mismatch'
+; SVE2:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @add_multiple_uses(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'add_multiple_uses'
+; SVE:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+; SVE2-LABEL: 'add_multiple_uses'
+; SVE2:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+  %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+  %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
+  %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
+  %add.res = add nuw nsw <vscale x 8 x i32> %add1, %add2
+  %res = trunc <vscale x 8 x i32> %add.res to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %res, ptr %a
+  ret void
+}
+
+define void @shift_multiple_uses(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'shift_multiple_uses'
+; SVE:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'shift_multiple_uses'
+; SVE2:       Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  %add3 = add nuw nsw <vscale x 16 x i16> %shr, %add2
+  %res = trunc <vscale x 16 x i16> %add3 to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %res, ptr %a
+  ret void
+}