[llvm] 5610269 - [Hexagon] Propagate metadata in Hexagon Vector Combine

Krzysztof Parzyszek via llvm-commits llvm-commits at lists.llvm.org
Sat May 8 13:16:46 PDT 2021


Author: Krzysztof Parzyszek
Date: 2021-05-08T14:35:55-05:00
New Revision: 561026936bd244f31a1b8dbe953680b20994ec2f

URL: https://github.com/llvm/llvm-project/commit/561026936bd244f31a1b8dbe953680b20994ec2f
DIFF: https://github.com/llvm/llvm-project/commit/561026936bd244f31a1b8dbe953680b20994ec2f.diff

LOG: [Hexagon] Propagate metadata in Hexagon Vector Combine
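
The aligned loads and stores synthesized by the pass now carry metadata
(e.g. !tbaa) collected from the original accesses via llvm::propagateMetadata
from llvm/Analysis/VectorUtils.h. When realignment may mix bytes from
adjacent sectors, metadata is accumulated from source sections of twice the
access width, so only metadata compatible across all contributing accesses
survives. A minimal usage sketch of the helper (the wrapper and variable
names here are illustrative, not part of the patch):

    #include "llvm/Analysis/VectorUtils.h"
    using namespace llvm;

    // propagateMetadata intersects metadata kinds such as !tbaa across the
    // given source values and attaches the result to the new instruction.
    void tagNewAccess(Instruction *NewAccess, ArrayRef<Value *> Sources) {
      propagateMetadata(NewAccess, Sources);
    }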

Added: 
    llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll

Modified: 
    llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 1438381ce1bfe..2ddbfd2657948 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -22,12 +22,14 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -179,12 +181,13 @@ class AlignVectors {
 
   struct ByteSpan {
     struct Segment {
+      // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
       Segment(Value *Val, int Begin, int Len)
           : Val(Val), Start(Begin), Size(Len) {}
       Segment(const Segment &Seg) = default;
-      Value *Val;
-      int Start;
-      int Size;
+      Value *Val; // Value representable as a sequence of bytes.
+      int Start;  // First byte of the value that belongs to the segment.
+      int Size;   // Number of bytes in the segment.
     };
 
     struct Block {
@@ -192,13 +195,14 @@ class AlignVectors {
       Block(Value *Val, int Off, int Len, int Pos)
           : Seg(Val, Off, Len), Pos(Pos) {}
       Block(const Block &Blk) = default;
-      Segment Seg;
-      int Pos;
+      Segment Seg; // Value segment.
+      int Pos;     // Position (offset) of the segment in the Block.
     };
 
     int extent() const;
     ByteSpan section(int Start, int Length) const;
     ByteSpan &shift(int Offset);
+    SmallVector<Value *, 8> values() const;
 
     int size() const { return Blocks.size(); }
     Block &operator[](int i) { return Blocks[i]; }
@@ -354,6 +358,13 @@ auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
   return *this;
 }
 
+auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
+  SmallVector<Value *, 8> Values(Blocks.size());
+  for (int i = 0, e = Blocks.size(); i != e; ++i)
+    Values[i] = Blocks[i].Seg.Val;
+  return Values;
+}
+
 auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
   const auto *C = dyn_cast<ConstantInt>(V);
   assert(C && "Alignment must be a compile-time constant integer");
@@ -763,28 +774,37 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
 
   Type *SecTy = HVC.getByteTy(ScLen);
   int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+  bool DoAlign = !HVC.isZero(AlignVal);
 
   if (Move.IsLoad) {
     ByteSpan ASpan;
     auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
     auto *Undef = UndefValue::get(SecTy);
 
-    for (int i = 0; i != NumSectors + 1; ++i) {
+    for (int i = 0; i != NumSectors + DoAlign; ++i) {
       Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
       // FIXME: generate a predicated load?
       Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
+      // If vector shifting is potentially needed, accumulate metadata
+      // from source sections of twice the load width.
+      int Start = (i - DoAlign) * ScLen;
+      int Width = (1 + DoAlign) * ScLen;
+      propagateMetadata(cast<Instruction>(Load),
+                        VSpan.section(Start, Width).values());
       ASpan.Blocks.emplace_back(Load, ScLen, i * ScLen);
     }
 
-    for (int j = 0; j != NumSectors; ++j) {
-      ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
-                                      ASpan[j + 1].Seg.Val, AlignVal);
+    if (DoAlign) {
+      for (int j = 0; j != NumSectors; ++j) {
+        ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
+                                        ASpan[j + 1].Seg.Val, AlignVal);
+      }
     }
 
     for (ByteSpan::Block &B : VSpan) {
-      ByteSpan Section = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
+      ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
       Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
-      for (ByteSpan::Block &S : Section) {
+      for (ByteSpan::Block &S : ASection) {
         Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
         Accum =
             HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
@@ -817,13 +837,13 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
 
     // Create an extra "undef" sector at the beginning and at the end.
     // They will be used as the left/right filler in the vlalign step.
-    for (int i = -1; i != NumSectors + 1; ++i) {
+    for (int i = -DoAlign; i != NumSectors + DoAlign; ++i) {
       // For stores, the size of each section is an aligned vector length.
       // Adjust the store offsets relative to the section start offset.
-      ByteSpan Section = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
+      ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
       Value *AccumV = UndefValue::get(SecTy);
       Value *AccumM = HVC.getNullValue(SecTy);
-      for (ByteSpan::Block &S : Section) {
+      for (ByteSpan::Block &S : VSection) {
         Value *Pay = getPayload(S.Seg.Val);
         Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
                                   Pay->getType(), HVC.getByteTy());
@@ -837,19 +857,29 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
     }
 
     // vlalign
-    for (int j = 1; j != NumSectors + 2; ++j) {
-      ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanV[j - 1].Seg.Val,
-                                           ASpanV[j].Seg.Val, AlignVal);
-      ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanM[j - 1].Seg.Val,
-                                           ASpanM[j].Seg.Val, AlignVal);
+    if (DoAlign) {
+      for (int j = 1; j != NumSectors + 2; ++j) {
+        ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanV[j - 1].Seg.Val,
+                                             ASpanV[j].Seg.Val, AlignVal);
+        ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanM[j - 1].Seg.Val,
+                                             ASpanM[j].Seg.Val, AlignVal);
+      }
     }
 
-    for (int i = 0; i != NumSectors + 1; ++i) {
+    for (int i = 0; i != NumSectors + DoAlign; ++i) {
       Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
       Value *Val = ASpanV[i].Seg.Val;
       Value *Mask = ASpanM[i].Seg.Val; // bytes
-      if (!HVC.isUndef(Val) && !HVC.isZero(Mask))
-        createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
+      if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
+        Value *Store = createAlignedStore(Builder, Val, Ptr, ScLen,
+                                          HVC.vlsb(Builder, Mask));
+        // If vector shifting is potentially needed, accumulate metadata
+        // from source sections of twice the store width.
+        int Start = (i - DoAlign) * ScLen;
+        int Width = (1 + DoAlign) * ScLen;
+        propagateMetadata(cast<Instruction>(Store),
+                          VSpan.section(Start, Width).values());
+      }
     }
   }
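
For reference, the accumulation window used in both the load and store paths
above can be sketched as standalone C++ (the 'window' helper is hypothetical;
ScLen is the sector length in bytes and DoAlign is 0 or 1, as in the pass):

    #include <utility>

    // Source window whose metadata feeds the sector at offset i * ScLen.
    // With DoAlign == 1 the realigned sector can also contain bytes from
    // the previous sector, so the window is shifted left and doubled.
    std::pair<int, int> window(int i, int ScLen, bool DoAlign) {
      int Start = (i - DoAlign) * ScLen; // one sector earlier when aligning
      int Width = (1 + DoAlign) * ScLen; // twice the width when aligning
      return {Start, Width};
    }

With ScLen = 128, window(1, 128, false) yields {128, 128}, while
window(1, 128, true) yields {0, 256}: metadata is then intersected over all
VSpan blocks overlapping that range (via ByteSpan::section and values()).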
 

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
new file mode 100644
index 0000000000000..9d8074177a1d3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
@@ -0,0 +1,299 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=hexagon -S -hexagon-vc -instcombine < %s | FileCheck %s
+
+; Check that Hexagon Vector Combine propagates (TBAA) metadata to the
+; generated output. (Use instcombine to clean the output up a bit.)
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Two unaligned loads, both with the same TBAA tag.
+;
+define <64 x i16> @f0(i16* %a0, i32 %a1) #0 {
+; CHECK-LABEL: @f0(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[TMP1]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <32 x i32>, <32 x i32>* [[TMP4]], align 128, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <64 x i16>* [[TMP6]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <128 x i8>, <128 x i8>* [[TMP7]], align 128, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16>* [[TMP9]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    ret <64 x i16> [[V8]]
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  %v3 = load <64 x i16>, <64 x i16>* %v2, align 2, !tbaa !0
+  %v4 = add i32 %a1, 128
+  %v5 = getelementptr i16, i16* %a0, i32 %v4
+  %v6 = bitcast i16* %v5 to <64 x i16>*
+  %v7 = load <64 x i16>, <64 x i16>* %v6, align 2, !tbaa !0
+  %v8 = add <64 x i16> %v3, %v7
+  ret <64 x i16> %v8
+}
+
+; Two unaligned loads, only one with a TBAA tag.
+;
+define <64 x i16> @f1(i16* %a0, i32 %a1) #0 {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[TMP1]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <32 x i32>, <32 x i32>* [[TMP4]], align 128, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <64 x i16>* [[TMP6]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <128 x i8>, <128 x i8>* [[TMP7]], align 128
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16>* [[TMP9]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    ret <64 x i16> [[V8]]
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  %v3 = load <64 x i16>, <64 x i16>* %v2, align 2, !tbaa !0
+  %v4 = add i32 %a1, 128
+  %v5 = getelementptr i16, i16* %a0, i32 %v4
+  %v6 = bitcast i16* %v5 to <64 x i16>*
+  %v7 = load <64 x i16>, <64 x i16>* %v6, align 2
+  %v8 = add <64 x i16> %v3, %v7
+  ret <64 x i16> %v8
+}
+
+; Two unaligned loads, with different TBAA tags.
+;
+define <64 x i16> @f2(i16* %a0, i32 %a1) #0 {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i32 [[TMP1]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <32 x i32>, <32 x i32>* [[TMP4]], align 128, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <64 x i16>* [[TMP6]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <128 x i8>, <128 x i8>* [[TMP7]], align 128
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16>* [[TMP9]] to <32 x i32>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA3:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    ret <64 x i16> [[V8]]
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  %v3 = load <64 x i16>, <64 x i16>* %v2, align 2, !tbaa !0
+  %v4 = add i32 %a1, 128
+  %v5 = getelementptr i16, i16* %a0, i32 %v4
+  %v6 = bitcast i16* %v5 to <64 x i16>*
+  %v7 = load <64 x i16>, <64 x i16>* %v6, align 2, !tbaa !3
+  %v8 = add <64 x i16> %v3, %v7
+  ret <64 x i16> %v8
+}
+
+; Two unaligned stores, both with the same TBAA tag.
+;
+define void @f3(i16* %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i16> [[A2:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP4]], <32 x i32> undef, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i32> [[TMP5]] to <128 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> zeroinitializer, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i32> [[TMP7]] to <128 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <64 x i16> [[A3:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16> [[A2]] to <32 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP9]], <32 x i32> [[TMP10]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <32 x i32> [[TMP11]] to <128 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <128 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <64 x i16> [[A3]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> undef, <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <128 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i32> [[TMP18]] to <128 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i32 [[TMP1]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <128 x i8> [[TMP8]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP6]], <128 x i8>* [[TMP20]], i32 128, <128 x i1> [[TMP21]]), !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <64 x i16>* [[TMP22]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP24:%.*]] = trunc <128 x i8> [[TMP14]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP12]], <128 x i8>* [[TMP23]], i32 128, <128 x i1> [[TMP24]]), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <64 x i16>* [[TMP25]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc <128 x i8> [[TMP19]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP17]], <128 x i8>* [[TMP26]], i32 128, <128 x i1> [[TMP27]]), !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  store <64 x i16> %a2, <64 x i16>* %v2, align 2, !tbaa !5
+  %v3 = add i32 %a1, 128
+  %v4 = getelementptr i16, i16* %a0, i32 %v3
+  %v5 = bitcast i16* %v4 to <64 x i16>*
+  store <64 x i16> %a3, <64 x i16>* %v5, align 2, !tbaa !5
+  ret void
+}
+
+; Two unaligned stores, only one with a TBAA tag.
+;
+define void @f4(i16* %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 {
+; CHECK-LABEL: @f4(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i16> [[A2:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP4]], <32 x i32> undef, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i32> [[TMP5]] to <128 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> zeroinitializer, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i32> [[TMP7]] to <128 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <64 x i16> [[A3:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16> [[A2]] to <32 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP9]], <32 x i32> [[TMP10]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <32 x i32> [[TMP11]] to <128 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <128 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <64 x i16> [[A3]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> undef, <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <128 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i32> [[TMP18]] to <128 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i32 [[TMP1]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <128 x i8> [[TMP8]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP6]], <128 x i8>* [[TMP20]], i32 128, <128 x i1> [[TMP21]])
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <64 x i16>* [[TMP22]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP24:%.*]] = trunc <128 x i8> [[TMP14]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP12]], <128 x i8>* [[TMP23]], i32 128, <128 x i1> [[TMP24]])
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <64 x i16>* [[TMP25]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc <128 x i8> [[TMP19]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP17]], <128 x i8>* [[TMP26]], i32 128, <128 x i1> [[TMP27]]), !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  store <64 x i16> %a2, <64 x i16>* %v2, align 2
+  %v3 = add i32 %a1, 128
+  %v4 = getelementptr i16, i16* %a0, i32 %v3
+  %v5 = bitcast i16* %v4 to <64 x i16>*
+  store <64 x i16> %a3, <64 x i16>* %v5, align 2, !tbaa !5
+  ret void
+}
+
+; Two unaligned stores, with different TBAA tags.
+;
+define void @f5(i16* %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 {
+; CHECK-LABEL: @f5(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = add i32 [[A1:%.*]], 64
+; CHECK-NEXT:    [[V1:%.*]] = getelementptr i16, i16* [[A0:%.*]], i32 [[V0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to <64 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i16* [[V1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <64 x i16> [[A2:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP4]], <32 x i32> undef, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i32> [[TMP5]] to <128 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> zeroinitializer, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i32> [[TMP7]] to <128 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <64 x i16> [[A3:%.*]] to <32 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <64 x i16> [[A2]] to <32 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> [[TMP9]], <32 x i32> [[TMP10]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <32 x i32> [[TMP11]] to <128 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <128 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <64 x i16> [[A3]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> undef, <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <128 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i32> [[TMP18]] to <128 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i32 [[TMP1]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <128 x i8> [[TMP8]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP6]], <128 x i8>* [[TMP20]], i32 128, <128 x i1> [[TMP21]]), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <64 x i16>* [[TMP22]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP24:%.*]] = trunc <128 x i8> [[TMP14]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP12]], <128 x i8>* [[TMP23]], i32 128, <128 x i1> [[TMP24]])
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr <64 x i16>, <64 x i16>* [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <64 x i16>* [[TMP25]] to <128 x i8>*
+; CHECK-NEXT:    [[TMP27:%.*]] = trunc <128 x i8> [[TMP19]] to <128 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[TMP17]], <128 x i8>* [[TMP26]], i32 128, <128 x i1> [[TMP27]]), !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+b0:
+  %v0 = add i32 %a1, 64
+  %v1 = getelementptr i16, i16* %a0, i32 %v0
+  %v2 = bitcast i16* %v1 to <64 x i16>*
+  store <64 x i16> %a2, <64 x i16>* %v2, align 2, !tbaa !5
+  %v3 = add i32 %a1, 128
+  %v4 = getelementptr i16, i16* %a0, i32 %v3
+  %v5 = bitcast i16* %v4 to <64 x i16>*
+  store <64 x i16> %a3, <64 x i16>* %v5, align 2, !tbaa !7
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv66" "target-features"="+hvx,+hvx-length128b" }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"load type 1", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"load type 2", !2}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"store type 1", !2}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"store type 2", !2}
