[llvm] 81a11da - [CGP, AArch64] Replace zexts with shuffle that can be lowered using tbl.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 15 11:18:50 PDT 2022


Author: Florian Hahn
Date: 2022-09-15T19:18:13+01:00
New Revision: 81a11da762577b000e615a874dc56eb927ff2c1b

URL: https://github.com/llvm/llvm-project/commit/81a11da762577b000e615a874dc56eb927ff2c1b
DIFF: https://github.com/llvm/llvm-project/commit/81a11da762577b000e615a874dc56eb927ff2c1b.diff

LOG: [CGP,AArch64] Replace zexts with shuffle that can be lowered using tbl.

This patch extends CodeGenPrepare to lower zext v16i8 -> v16i32 in loops
using a wide shuffle creating a v64i8 vector, selecting groups of 3
zero elements and an element from the input.

This is profitable on AArch64 where such shuffles can be lowered to tbl
instructions, but only in loops, because it requires materializing 4
masks, which can be done in the loop preheader.

This is the only reason the transform is part of CGP. If there's a
better alternative I missed, please let me know. The same goes for the
optimizeExtendOrTruncateConversion hook which guards this. I am not sure
if this transform will be beneficial on other targets, but it seems like
there is no other convenient way.

This improves the generated code for loops like the one below in
combination with D96522.

    int foo(uint8_t *p, int N) {
      unsigned long long sum = 0;
      for (int i = 0; i < N ; i++, p++) {
        unsigned int v = *p;
        sum += (v < 127) ? v : 256 - v;
      }
      return sum;
    }

https://clang.godbolt.org/z/Wco866MjY

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D120571

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/vselect-ext.ll
    llvm/test/CodeGen/AArch64/zext-to-tbl.ll
    llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3495e3c78a81e..7f6580d05e516 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -67,6 +67,7 @@ class Constant;
 class FastISel;
 class FunctionLoweringInfo;
 class GlobalValue;
+class Loop;
 class GISelKnownBits;
 class IntrinsicInst;
 class IRBuilderBase;
@@ -2798,6 +2799,13 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Try to optimize extending or truncating conversion instructions (like
+  /// zext, trunc, fptoui, uitofp) for the target.
+  virtual bool optimizeExtendOrTruncateConversion(Instruction *I,
+                                                  Loop *L) const {
+    return false;
+  }
+
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met

diff  --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 11f284be6387c..9e77f795ea8ce 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8055,6 +8055,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
           TargetLowering::TypeExpandInteger) {
         return SinkCast(CI);
       } else {
+        if (TLI->optimizeExtendOrTruncateConversion(
+                I, LI->getLoopFor(I->getParent())))
+          return true;
+
         bool MadeChange = optimizeExt(I);
         return MadeChange | optimizeExtUses(I);
       }

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f1c48f9ea6f64..0de2645aa23f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -13183,6 +13184,60 @@ bool AArch64TargetLowering::shouldSinkOperands(
   return false;
 }
 
+static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
+  Value *Op = ZExt->getOperand(0);
+  auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType());
+  auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType());
+  unsigned NumElts = SrcTy->getNumElements();
+  IRBuilder<> Builder(ZExt);
+  SmallVector<int> Mask(4 * NumElts, NumElts);
+  // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
+  // replace the original ZExt. This can later be lowered to a set of tbl
+  // instructions.
+  for (unsigned i = 0; i < NumElts; i++) {
+    if (IsLittleEndian)
+      Mask[i * 4] = i;
+    else
+      Mask[i * 4 + 3] = i;
+  }
+
+  auto *FirstEltZero = Builder.CreateInsertElement(
+      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
+  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
+  Result = Builder.CreateBitCast(Result, DstTy);
+  ZExt->replaceAllUsesWith(Result);
+  ZExt->eraseFromParent();
+}
+
+bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
+                                                               Loop *L) const {
+  // Try to optimize conversions using tbl. This requires materializing constant
+  // index vectors, which can increase code size and add loads. Skip the
+  // transform unless the conversion is in a loop block guaranteed to execute
+  // and we are not optimizing for size.
+  Function *F = I->getParent()->getParent();
+  if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
+      F->hasOptSize())
+    return false;
+
+  auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
+  auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
+  if (!SrcTy || !DstTy)
+    return false;
+
+  // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be
+  // lowered to either 2 or 4 tbl instructions to insert the original i8
+  // elements into i32 lanes.
+  auto *ZExt = dyn_cast<ZExtInst>(I);
+  if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
+      SrcTy->getElementType()->isIntegerTy(8) &&
+      DstTy->getElementType()->isIntegerTy(32)) {
+    createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
+    return true;
+  }
+  return false;
+}
+
 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                           Align &RequiredAligment) const {
   if (!LoadedType.isSimple() ||

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index a5552caa9bb09..7b1b0e5f3aa2a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -606,6 +606,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldSinkOperands(Instruction *I,
                           SmallVectorImpl<Use *> &Ops) const override;
 
+  bool optimizeExtendOrTruncateConversion(Instruction *I,
+                                          Loop *L) const override;
+
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
 
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

diff  --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index fa0db82b7a9a4..6f16bc02911e2 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -573,35 +573,53 @@ entry:
 define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x9, lCPI24_0 at PAGE
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    adrp x10, lCPI24_1 at PAGE
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    adrp x11, lCPI24_2 at PAGE
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    adrp x12, lCPI24_3 at PAGE
+; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:    ldr q0, [x9, lCPI24_0 at PAGEOFF]
+; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:    ldr q1, [x10, lCPI24_1 at PAGEOFF]
+; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:    ldr q3, [x11, lCPI24_2 at PAGEOFF]
+; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:    ldr q4, [x12, lCPI24_3 at PAGEOFF]
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q1, [x0, x8]
+; CHECK-NEXT:    ldr q5, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v2, v1, v0
-; CHECK-NEXT:    ushll2.8h v3, v1, #0
-; CHECK-NEXT:    sshll2.8h v4, v2, #0
-; CHECK-NEXT:    ushll2.4s v5, v3, #0
-; CHECK-NEXT:    ushll.4s v3, v3, #0
-; CHECK-NEXT:    sshll2.4s v6, v4, #0
-; CHECK-NEXT:    sshll.4s v4, v4, #0
-; CHECK-NEXT:    ushll.8h v1, v1, #0
-; CHECK-NEXT:    sshll.8h v2, v2, #0
+; CHECK-NEXT:    cmgt.16b v6, v5, v2
+; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
+; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
+; CHECK-NEXT:    sshll2.8h v18, v6, #0
+; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
+; CHECK-NEXT:    sshll2.4s v19, v18, #0
+; CHECK-NEXT:    sshll.4s v18, v18, #0
+; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
+; CHECK-NEXT:    sshll.8h v6, v6, #0
+; CHECK-NEXT:    and.16b v7, v7, v19
+; CHECK-NEXT:    and.16b v16, v16, v18
+; CHECK-NEXT:    stp q16, q7, [x1, #32]
+; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    sshll.4s v6, v6, #0
+; CHECK-NEXT:    and.16b v7, v17, v7
 ; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    and.16b v3, v3, v4
-; CHECK-NEXT:    stp q3, q5, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v4, v2, #0
-; CHECK-NEXT:    sshll.4s v2, v2, #0
-; CHECK-NEXT:    ushll2.4s v3, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    and.16b v3, v3, v4
-; CHECK-NEXT:    and.16b v1, v1, v2
-; CHECK-NEXT:    stp q1, q3, [x1], #64
+; CHECK-NEXT:    stp q5, q7, [x1], #64
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
 entry:
   br label %loop
 
@@ -627,23 +645,23 @@ exit:
 define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x9, lCPI25_0 at PAGE
-; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    adrp x10, lCPI25_1 at PAGE
-; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x11, lCPI25_2 at PAGE
-; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    adrp x12, lCPI25_3 at PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI25_0 at PAGEOFF]
-; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI25_1 at PAGEOFF]
-; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI25_2 at PAGEOFF]
-; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI25_3 at PAGEOFF]
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -670,10 +688,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
+; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
 entry:
   br label %loop
 
@@ -700,23 +718,23 @@ exit:
 define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x9, lCPI26_0 at PAGE
-; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    adrp x10, lCPI26_1 at PAGE
-; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:  Lloh20:
 ; CHECK-NEXT:    adrp x11, lCPI26_2 at PAGE
-; CHECK-NEXT:  Lloh13:
+; CHECK-NEXT:  Lloh21:
 ; CHECK-NEXT:    adrp x12, lCPI26_3 at PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI26_0 at PAGEOFF]
-; CHECK-NEXT:  Lloh15:
+; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI26_1 at PAGEOFF]
-; CHECK-NEXT:  Lloh16:
+; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI26_2 at PAGEOFF]
-; CHECK-NEXT:  Lloh17:
+; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI26_3 at PAGEOFF]
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -743,10 +761,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    b.ne LBB26_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh25
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh23
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh22
 entry:
   br label %loop
 

diff  --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 0daf3714e3a62..6d2615e7f42e8 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2,31 +2,199 @@
 ; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
 
+; CHECK-LABEL: lCPI0_0:
+; CHECK-NEXT:    .byte   0                               ; 0x0
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   1                               ; 0x1
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   2                               ; 0x2
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   3                               ; 0x3
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_1:
+; CHECK-NEXT:    .byte   4                               ; 0x4
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   5                               ; 0x5
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   6                               ; 0x6
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   7                               ; 0x7
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_2:
+; CHECK-NEXT:    .byte   8                               ; 0x8
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   9                               ; 0x9
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   10                              ; 0xa
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   11                              ; 0xb
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_3:
+; CHECK-NEXT:    .byte   12                              ; 0xc
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   13                              ; 0xd
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   14                              ; 0xe
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   15                              ; 0xf
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+
+; CHECK-BE: .LCPI0_0:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	0                               // 0x0
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	1                               // 0x1
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	2                               // 0x2
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	3                               // 0x3
+; CHECK-BE-NEXT: .LCPI0_1:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	4                               // 0x4
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	5                               // 0x5
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	6                               // 0x6
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	7                               // 0x7
+; CHECK-BE-NEXT: .LCPI0_2:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	8                               // 0x8
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	9                               // 0x9
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	10                              // 0xa
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	11                              // 0xb
+; CHECK-BE-NEXT: .LCPI0_3:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	12                              // 0xc
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	13                              // 0xd
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	14                              // 0xe
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	15                              // 0xf
+
 ; It's profitable to convert the zext to a shuffle, which in turn will be
 ; lowered to 4 tbl instructions. The masks are materialized outside the loop.
 define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x9, lCPI0_0 at PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    adrp x10, lCPI0_1 at PAGE
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x11, lCPI0_2 at PAGE
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    adrp x12, lCPI0_3 at PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    ldr q0, [x9, lCPI0_0 at PAGEOFF]
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    ldr q1, [x10, lCPI0_1 at PAGEOFF]
+; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:    ldr q2, [x11, lCPI0_2 at PAGEOFF]
+; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:    ldr q3, [x12, lCPI0_3 at PAGEOFF]
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ldr q4, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ushll2.8h v1, v0, #0
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
-; CHECK-NEXT:    stp q0, q3, [x1], #64
+; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v6, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
+; CHECK-NEXT:    stp q6, q5, [x1, #32]
+; CHECK-NEXT:    stp q4, q7, [x1], #64
 ; CHECK-NEXT:    b.ne LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh1, Lloh5
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh4
 ;
 ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_1
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_1
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_2
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_2
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_3
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_3
+; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB0_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -34,20 +202,18 @@ define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
-; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
+; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v0.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v1.16b
+; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v6.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v7.16b }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB0_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -393,39 +559,123 @@ exit:
   ret void
 }
 
+; CHECK-LABEL: lCPI6_0:
+; CHECK-NEXT:     .byte   0                               ; 0x0
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   1                               ; 0x1
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   2                               ; 0x2
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   3                               ; 0x3
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT: lCPI6_1:
+; CHECK-NEXT:     .byte   4                               ; 0x4
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   5                               ; 0x5
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   6                               ; 0x6
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   7                               ; 0x7
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+
+; CHECK-BE:       .LCPI6_0:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	0                               // 0x0
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	1                               // 0x1
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	2                               // 0x2
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	3                               // 0x3
+; CHECK-BE-NEXT: .LCPI6_1:
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	4                               // 0x4
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	5                               // 0x5
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	6                               // 0x6
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	255                             // 0xff
+; CHECK-BE-NEXT: 	.byte	7                               // 0x7
+
 define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:    adrp x9, lCPI6_0 at PAGE
+; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:    adrp x10, lCPI6_1 at PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:    ldr q0, [x9, lCPI6_0 at PAGEOFF]
+; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:    ldr q1, [x10, lCPI6_1 at PAGEOFF]
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr d0, [x0, x8]
+; CHECK-NEXT:    ldr d2, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v1, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    stp q0, q1, [x1], #64
+; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
+; CHECK-NEXT:    stp q2, q3, [x1], #64
 ; CHECK-NEXT:    b.ne LBB6_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh9, Lloh11
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh10
 ;
 ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI6_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI6_1
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_1
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB6_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
-; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    ld1 { v2.8b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT:    tbl v3.16b, { v2.16b }, v0.16b
+; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v1.16b
+; CHECK-BE-NEXT:    st1 { v3.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v2.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB6_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret

diff  --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
index 77dfcd2f320e0..7de6718d41edf 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
@@ -15,10 +15,11 @@ define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
-; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[DST_GEP_CAST]], align 64
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]


        


More information about the llvm-commits mailing list