[llvm] 0147284 - Revert "[Hexagon] Improve casting of boolean HVX vectors to scalars"

Thu Feb 27 03:01:24 PST 2020

Author: Kirill Bobyrev
Date: 2020-02-27T11:58:32+01:00
New Revision: 014728413f354c8f44375074b331e68ce194bbd2

URL: https://github.com/llvm/llvm-project/commit/014728413f354c8f44375074b331e68ce194bbd2
DIFF: https://github.com/llvm/llvm-project/commit/014728413f354c8f44375074b331e68ce194bbd2.diff

LOG: Revert "[Hexagon] Improve casting of boolean HVX vectors to scalars"

This reverts commit 7691790dfd1011d08f5468f63952d7690755aad4.

The patch is failing tests with MSAN:
http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-fast/builds/39054/steps/check-llvm%20msan/logs/stdio

Added: 
    

Modified: 
    llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
    llvm/lib/Target/Hexagon/HexagonISelLowering.h
    llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
    llvm/test/CodeGen/Hexagon/autohvx/isel-hvx-pred-bitcast.ll
    llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll
    llvm/test/CodeGen/Hexagon/store-vector-pred.ll

Removed: 
    llvm/test/CodeGen/Hexagon/autohvx/isel-store-bitcast-v128i1.ll


################################################################################
diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 2e7a65ba0a0d..f3967573c70e 100644

--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1681,6 +1681,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::STORE, VT, Custom);
   }
 
+  setOperationAction(ISD::STORE, MVT::v128i1, Custom);
+
   for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v8i8, MVT::v2i32, MVT::v4i16,
                  MVT::v2i32}) {
     setCondCodeAction(ISD::SETNE,  VT, Expand);
@@ -1694,6 +1696,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
 
   // Custom-lower bitcasts from i8 to v8i1.
   setOperationAction(ISD::BITCAST,        MVT::i8,    Custom);
+  setOperationAction(ISD::BITCAST,        MVT::i32,   Custom);
+  setOperationAction(ISD::BITCAST,        MVT::i64,   Custom);
   setOperationAction(ISD::SETCC,          MVT::v2i16, Custom);
   setOperationAction(ISD::VSELECT,        MVT::v4i8,  Custom);
   setOperationAction(ISD::VSELECT,        MVT::v2i16, Custom);
@@ -3077,12 +3081,6 @@ void
 HexagonTargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
-  if (isHvxOperation(N)) {
-    LowerHvxOperationWrapper(N, Results, DAG);
-    if (!Results.empty())
-      return;
-  }
-
   // We are only custom-lowering stores to verify the alignment of the
   // address if it is a compile-time constant. Since a store can be modified
   // during type-legalization (the value being stored may need legalization),
@@ -3096,12 +3094,6 @@ void
 HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
-  if (isHvxOperation(N)) {
-    ReplaceHvxNodeResults(N, Results, DAG);
-    if (!Results.empty())
-      return;
-  }
-
   const SDLoc &dl(N);
   switch (N->getOpcode()) {
     case ISD::SRL:
@@ -3386,25 +3378,12 @@ EVT HexagonTargetLowering::getOptimalMemOpType(
   return MVT::Other;
 }
 
-bool HexagonTargetLowering::allowsMemoryAccess(LLVMContext &Context,
-      const DataLayout &DL, EVT VT, unsigned AddrSpace, unsigned Alignment,
-      MachineMemOperand::Flags Flags, bool *Fast) const {
-  MVT SVT = VT.getSimpleVT();
-  if (Subtarget.isHVXVectorType(SVT, true))
-    return allowsHvxMemoryAccess(SVT, Alignment, Flags, Fast);
-  return TargetLoweringBase::allowsMemoryAccess(
-              Context, DL, VT, AddrSpace, Alignment, Flags, Fast);
-}
-
 bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(
-      EVT VT, unsigned AddrSpace, unsigned Alignment,
-      MachineMemOperand::Flags Flags, bool *Fast) const {
-  MVT SVT = VT.getSimpleVT();
-  if (Subtarget.isHVXVectorType(SVT, true))
-    return allowsHvxMisalignedMemoryAccesses(SVT, Alignment, Flags, Fast);
+    EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *Fast) const {
   if (Fast)
     *Fast = false;
-  return false;
+  return Subtarget.isHVXVectorType(VT.getSimpleVT());
 }
 
 std::pair<const TargetRegisterClass*, uint8_t>

diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 49871306f017..7239c1dc71d2 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -305,12 +305,8 @@ namespace HexagonISD {
     EVT getOptimalMemOpType(const MemOp &Op,
                             const AttributeList &FuncAttributes) const override;
 
-    bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
-        unsigned AddrSpace, unsigned Alignment, MachineMemOperand::Flags Flags,
-        bool *Fast) const override;
-
     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-        unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast)
+        unsigned Align, MachineMemOperand::Flags Flags, bool *Fast)
         const override;
 
     /// Returns relocation base for the given PIC jumptable.
@@ -408,11 +404,6 @@ namespace HexagonISD {
     VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
     SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
 
-    bool allowsHvxMemoryAccess(MVT VecTy, unsigned Alignment,
-        MachineMemOperand::Flags Flags, bool *Fast) const;
-    bool allowsHvxMisalignedMemoryAccesses(MVT VecTy, unsigned Align,
-        MachineMemOperand::Flags Flags, bool *Fast) const;
-
     bool isHvxSingleTy(MVT Ty) const;
     bool isHvxPairTy(MVT Ty) const;
     bool isHvxBoolTy(MVT Ty) const;
@@ -447,8 +438,6 @@ namespace HexagonISD {
                                    const SDLoc &dl, SelectionDAG &DAG) const;
     SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
                                 bool ZeroExt, SelectionDAG &DAG) const;
-    SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy,
-                            SelectionDAG &DAG) const;
 
     SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
@@ -478,12 +467,8 @@ namespace HexagonISD {
         const override;
 
     bool isHvxOperation(SDValue Op) const;
-    bool isHvxOperation(SDNode *N) const;
     SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
-    void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                                  SelectionDAG &DAG) const;
-    void ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                               SelectionDAG &DAG) const;
+
     SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   };
 

diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 153087a39a3f..2b5257e47712 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -65,15 +65,6 @@ HexagonTargetLowering::initializeHVXLowering() {
     AddPromotedToType(Opc, FromTy, ToTy);
   };
 
-  // Handle bitcasts of vector predicates to scalars (e.g. v32i1 to i32).
-  // Note: v16i1 -> i16 is handled in type legalization instead of op
-  // legalization.
-  setOperationAction(ISD::BITCAST,            MVT::i16,   Custom);
-  setOperationAction(ISD::BITCAST,            MVT::i32,   Custom);
-  setOperationAction(ISD::BITCAST,            MVT::i64,   Custom);
-  setOperationAction(ISD::BITCAST,            MVT::v16i1, Custom);
-  setOperationAction(ISD::BITCAST,            MVT::v128i1, Custom);
-  setOperationAction(ISD::BITCAST,            MVT::i128, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE,     ByteV,      Legal);
   setOperationAction(ISD::VECTOR_SHUFFLE,     ByteW,      Legal);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -299,25 +290,6 @@ HexagonTargetLowering::isHvxBoolTy(MVT Ty) const {
          Ty.getVectorElementType() == MVT::i1;
 }
 
-bool
-HexagonTargetLowering::allowsHvxMemoryAccess(MVT VecTy, unsigned Alignment,
-        MachineMemOperand::Flags Flags, bool *Fast) const {
-  // Bool vectors are excluded by default, but make it explicit to
-  // emphasize that bool vectors cannot be loaded or stored.
-  return Subtarget.isHVXVectorType(VecTy, /*IncludeBool=*/false);
-}
-
-bool
-HexagonTargetLowering::allowsHvxMisalignedMemoryAccesses(MVT VecTy,
-        unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const {
-  if (!Subtarget.isHVXVectorType(VecTy))
-    return false;
-  // XXX Should this be false?  vmemu are a bit slower than vmem.
-  if (Fast)
-    *Fast = true;
-  return true;
-}
-
 SDValue
 HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
                                           SelectionDAG &DAG) const {
@@ -1057,61 +1029,6 @@ HexagonTargetLowering::extendHvxVectorPred(SDValue VecV, const SDLoc &dl,
   return DAG.getSelect(dl, ResTy, VecV, True, False);
 }
 
-SDValue
-HexagonTargetLowering::compressHvxPred(SDValue VecQ, const SDLoc &dl,
-      MVT ResTy, SelectionDAG &DAG) const {
-  // Given a predicate register VecQ, transfer bits VecQ[0..HwLen-1]
-  // (i.e. the entire predicate register) to bits [0..HwLen-1] of a
-  // vector register. The remaining bits of the vector register are
-  // unspecified.
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  unsigned HwLen = Subtarget.getVectorLength();
-  MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
-  MVT PredTy = ty(VecQ);
-  unsigned PredLen = PredTy.getVectorNumElements();
-  assert(HwLen % PredLen == 0);
-  MVT VecTy = MVT::getVectorVT(MVT::getIntegerVT(8*HwLen/PredLen), PredLen);
-
-  Type *Int8Ty = Type::getInt8Ty(*DAG.getContext());
-  SmallVector<Constant*, 128> Tmp;
-  // Create an array of bytes (hex): 01,02,04,08,10,20,40,80, 01,02,04,08,...
-  // These are bytes with the LSB rotated left with respect to their index.
-  for (unsigned i = 0; i != HwLen/8; ++i) {
-    for (unsigned j = 0; j != 8; ++j)
-      Tmp.push_back(ConstantInt::get(Int8Ty, 1u << j));
-  }
-  Constant *CV = ConstantVector::get(Tmp);
-  unsigned Align = HwLen;
-  SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, ByteTy, Align), DAG);
-  SDValue Bytes = DAG.getLoad(ByteTy, dl, DAG.getEntryNode(), CP,
-      MachinePointerInfo::getConstantPool(MF), Align);
-
-  // Select the bytes that correspond to true bits in the vector predicate.
-  SDValue Sel = DAG.getSelect(dl, VecTy, VecQ, DAG.getBitcast(VecTy, Bytes),
-      getZero(dl, VecTy, DAG));
-  // Calculate the OR of all bytes in each group of 8. That will compress
-  // all the individual bits into a single byte.
-  // First, OR groups of 4, via vrmpy with 0x01010101.
-  SDValue All1 =
-      DAG.getSplatBuildVector(MVT::v4i8, dl, DAG.getConstant(1, dl, MVT::i32));
-  SDValue Vrmpy = getInstr(Hexagon::V6_vrmpyub, dl, ByteTy, {Sel, All1}, DAG);
-  // Then rotate the accumulated vector by 4 bytes, and do the final OR.
-  SDValue Rot = getInstr(Hexagon::V6_valignbi, dl, ByteTy,
-      {Vrmpy, Vrmpy, DAG.getTargetConstant(4, dl, MVT::i32)}, DAG);
-  SDValue Vor = DAG.getNode(ISD::OR, dl, ByteTy, {Vrmpy, Rot});
-
-  // Pick every 8th byte and coalesce them at the beginning of the output.
-  // For symmetry, coalesce every 1+8th byte after that, then every 2+8th
-  // byte and so on.
-  SmallVector<int,128> Mask;
-  for (unsigned i = 0; i != HwLen; ++i)
-    Mask.push_back((8*i) % HwLen + i/(HwLen/8));
-  SDValue Collect =
-      DAG.getVectorShuffle(ByteTy, dl, Vor, DAG.getUNDEF(ByteTy), Mask);
-  return DAG.getBitcast(ResTy, Collect);
-}
-
 SDValue
 HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
       const {
@@ -1520,58 +1437,192 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
   return T7;
 }
 
+// This function does the computation needed to bitcast a vector predicate
+// register to a vector of integers.
 SDValue
-HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
-  SDValue ValQ = Op.getOperand(0);
-  MVT ResTy = ty(Op);
-  MVT VecTy = ty(ValQ);
+HexagonTargetLowering::HvxVecPredBitcastComputation(SDValue Op,
+                                                    SelectionDAG &DAG) const {
   const SDLoc &dl(Op);
+  MVT VecTy;
+  int Length;
+  if (Subtarget.useHVX64BOps()) {
+    VecTy = MVT::getVectorVT(MVT::i32, 16);
+    Length = 2;
+  }
+  if (Subtarget.useHVX128BOps()) {
+    VecTy = MVT::getVectorVT(MVT::i32, 32);
+    Length = 4;
+  }
+  // r0 = ##0x08040201 // Pre-rotated bits per 4 consecutive bytes.
+  SDValue C8421 = DAG.getTargetConstant(0x08040201, dl, MVT::i32);
+  SDValue InstrC8421 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C8421, DAG);
+  // v0 = vand(q0,r0)
+  SDValue Vand =
+      getInstr(Hexagon::V6_vandqrt, dl, VecTy, {Op, InstrC8421}, DAG);
+
+  // Or the bytes in each word into a single byte: that will form packs
+  // of 4 bits of the output.
+  // v1 = valign(v0,v0,#2)
+  SDValue C2 = DAG.getTargetConstant(2, dl, MVT::i32);
+  SDValue Valign =
+      getInstr(Hexagon::V6_valignbi, dl, VecTy, {Vand, Vand, C2}, DAG);
+  // v0 = vor(v0,v1)
+  SDValue Vor = getInstr(Hexagon::V6_vor, dl, VecTy, {Vand, Valign}, DAG);
+  // v1 = valign(v0,v0,#1)
+  SDValue C1 = DAG.getTargetConstant(1, dl, MVT::i32);
+  SDValue Valign1 =
+      getInstr(Hexagon::V6_valignbi, dl, VecTy, {Vor, Vor, C1}, DAG);
+  // v0 = vor(v0,v1)
+  SDValue Vor1 = getInstr(Hexagon::V6_vor, dl, VecTy, {Vor, Valign1}, DAG);
+
+  // Clear all the bytes per word except the lowest one.
+  // r0 = #0xff
+  SDValue Cff = DAG.getTargetConstant(0xff, dl, MVT::i32);
+  SDValue InstrCff = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, Cff, DAG);
+  // v1 = vsplat(r0)
+  SDValue Vsplat = getInstr(Hexagon::V6_lvsplatw, dl, VecTy, InstrCff, DAG);
+  // v0 = vand(v0,v1)
+  SDValue Vand1 = getInstr(Hexagon::V6_vand, dl, VecTy, {Vor1, Vsplat}, DAG);
+
+  // Shift each word left by its index to position the 4-bit packs for oring.
+  // The words 0..8 and 16..31 need to be ored to form the 64-bit output.
+  // r0 = ##.Lshifts
+  // .Lshifts:
+  // .word 0
+  // .word 4
+  // .word 8
+  // .word 12
+  // .word 16
+  // .word 20
+  // .word 24
+  // .word 28
+  // .word 0
+  // .word 4
+  // .word 8
+  // .word 12
+  // .word 16
+  // .word 20
+  // .word 24
+  // .word 28
+  // v1 = vmem(r0+#0)
+  SmallVector<SDValue, 32> Elems;
+  for (int i = 0; i < Length; ++i) {
+    Elems.push_back(DAG.getConstant(0, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(4, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(8, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(12, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(16, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(20, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(24, dl, MVT::i32));
+    Elems.push_back(DAG.getConstant(28, dl, MVT::i32));
+  }
 
-  if (isHvxBoolTy(VecTy) && ResTy.isScalarInteger()) {
-    unsigned HwLen = Subtarget.getVectorLength();
-    MVT WordTy = MVT::getVectorVT(MVT::i32, HwLen/4);
-    SDValue VQ = compressHvxPred(ValQ, dl, WordTy, DAG);
-    unsigned BitWidth = ResTy.getSizeInBits();
-
-    if (BitWidth < 64) {
-      SDValue W0 = extractHvxElementReg(VQ, DAG.getConstant(0, dl, MVT::i32),
-          dl, MVT::i32, DAG);
-      if (BitWidth == 32)
-        return W0;
-      assert(BitWidth < 32u);
-      return DAG.getZExtOrTrunc(W0, dl, ResTy);
-    }
+  SDValue BV = DAG.getBuildVector(VecTy, dl, Elems);
+  // v0.w = vasl(v0.w,v1.w)
+  SDValue Vasl = getInstr(Hexagon::V6_vaslwv, dl, VecTy, {Vand1, BV}, DAG);
+
+  // 3 rounds of oring.
+  // r0 = #16 // HwLen/4
+  SDValue C16 = DAG.getTargetConstant(16, dl, MVT::i32);
+  SDValue InstrC16 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C16, DAG);
+  // v1 = vror(v0,r0)
+  SDValue Vror = getInstr(Hexagon::V6_vror, dl, VecTy, {Vasl, InstrC16}, DAG);
+  // v0 = vor(v0,v1)
+  SDValue Vor2 = getInstr(Hexagon::V6_vor, dl, VecTy, {Vasl, Vror}, DAG);
+  // r0 = #8 // HwLen/8
+  SDValue C8 = DAG.getTargetConstant(8, dl, MVT::i32);
+  SDValue InstrC8 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C8, DAG);
+  // v1 = vror(v0,r0)
+  SDValue Vror1 = getInstr(Hexagon::V6_vror, dl, VecTy, {Vor2, InstrC8}, DAG);
+  // v0 = vor(v0,v1)
+  SDValue Vor3 = getInstr(Hexagon::V6_vor, dl, VecTy, {Vor2, Vror1}, DAG);
+  // r0 = #4 // HwLen/16
+  SDValue C4 = DAG.getTargetConstant(4, dl, MVT::i32);
+  SDValue InstrC4 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C4, DAG);
+  // v1 = vror(v0,r0)
+  SDValue Vror2 = getInstr(Hexagon::V6_vror, dl, VecTy, {Vor3, InstrC4}, DAG);
+  // v0 = vor(v0,v1)
+  SDValue Vor4 = getInstr(Hexagon::V6_vor, dl, VecTy, {Vor3, Vror2}, DAG);
+  return Vor4;
+}
 
-    // The result is >= 64 bits. The only options are 64 or 128.
-    assert(BitWidth == 64 || BitWidth == 128);
-    SmallVector<SDValue,4> Words;
-    for (unsigned i = 0; i != BitWidth/32; ++i) {
-      SDValue W = extractHvxElementReg(
-          VQ, DAG.getConstant(i, dl, MVT::i32), dl, MVT::i32, DAG);
-      Words.push_back(W);
-    }
-    SmallVector<SDValue,2> Combines;
-    assert(Words.size() % 2 == 0);
-    for (unsigned i = 0, e = Words.size(); i < e; i += 2) {
-      SDValue C = DAG.getNode(
-          HexagonISD::COMBINE, dl, MVT::i64, {Words[i], Words[i+1]});
-      Combines.push_back(C);
-    }
+SDValue HexagonTargetLowering::LowerHvxBitcast(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  auto *N = Op.getNode();
+  EVT VT = N->getValueType(0);
+  const SDLoc &dl(Op);
+  SDValue Q0 = N->getOperand(0);
+  EVT VTOp = Q0.getNode()->getValueType(0);
+  if (!(VT == MVT::i64 || VT == MVT::i32) ||
+      !(VTOp == MVT::v64i1 || VTOp == MVT::v32i1))
+    return Op;
 
-    if (BitWidth == 64)
-      return Combines[0];
-
-    // It must be i128. I128 is not a legal type, so this part will be
-    // executed during type legalization. We need to generate code that
-    // the default expansion can break up into smaller pieces.
-    SDValue C0 = DAG.getZExtOrTrunc(Combines[0], dl, ResTy);
-    SDValue C1 = DAG.getNode(ISD::SHL, dl, ResTy,
-        DAG.getZExtOrTrunc(Combines[1], dl, ResTy),
-        DAG.getConstant(64, dl, MVT::i32));
-    return DAG.getNode(ISD::OR, dl, ResTy, C0, C1);
+  SDValue Vor4 = HvxVecPredBitcastComputation(Q0, DAG);
+
+  // The output is v.w[8]:v.w[0]
+  // r3 = #0
+  SDValue C0 = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue InstrC0 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C0, DAG);
+  // r0 = vextract(v0,r3)
+  SDValue Res =
+      getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC0}, DAG);
+  if (VT == MVT::i64) {
+    // r3 = #32
+    SDValue C32 = DAG.getTargetConstant(32, dl, MVT::i32);
+    SDValue InstrC32 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C32, DAG);
+    // r1 = vextract(v0,r3)
+    SDValue Vextract =
+        getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC32}, DAG);
+    Res = getInstr(Hexagon::A2_combinew, dl, MVT::i64, {Vextract, Res}, DAG);
   }
+  return Res;
+}
 
-  return Op;
+SDValue HexagonTargetLowering::LowerHvxStore(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  auto *N = Op.getNode();
+  const SDLoc &dl(Op);
+  SDValue Q0 = N->getOperand(1);
+  EVT VTOp = Q0.getNode()->getValueType(0);
+  if (Op.getOpcode() != ISD::STORE || VTOp != MVT::v128i1)
+    return Op;
+  SDValue Vor4 = HvxVecPredBitcastComputation(Q0, DAG);
+  // The output is v.w[8]:v.w[0]
+  // r3 = #0
+  SDValue C0 = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue InstrC0 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C0, DAG);
+  // r0 = vextract(v0,r3)
+  SDValue Vextract0 =
+      getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC0}, DAG);
+  // r3 = #32
+  SDValue C32 = DAG.getTargetConstant(32, dl, MVT::i32);
+  SDValue InstrC32 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C32, DAG);
+  // r1 = vextract(v0,r3)
+  SDValue Vextract1 =
+      getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC32}, DAG);
+  SDValue Combine0 =
+      getInstr(Hexagon::A2_combinew, dl, MVT::i64, {Vextract1, Vextract0}, DAG);
+  // r3 = #64
+  SDValue C64 = DAG.getTargetConstant(64, dl, MVT::i32);
+  SDValue InstrC64 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C64, DAG);
+  // r0 = vextract(v0,r3)
+  SDValue Vextract2 =
+      getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC64}, DAG);
+  // r3 = #96
+  SDValue C96 = DAG.getTargetConstant(96, dl, MVT::i32);
+  SDValue InstrC96 = getInstr(Hexagon::A2_tfrsi, dl, MVT::i32, C96, DAG);
+  // r1 = vextract(v0,r3)
+  SDValue Vextract3 =
+      getInstr(Hexagon::V6_extractw, dl, MVT::i32, {Vor4, InstrC96}, DAG);
+  SDValue Combine1 =
+      getInstr(Hexagon::A2_combinew, dl, MVT::i64, {Vextract3, Vextract2}, DAG);
+  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+  SDValue C8 = DAG.getTargetConstant(8, dl, MVT::i32);
+  const SDValue Ops1[] = {ST->getBasePtr(), C8, Combine1, ST->getChain()};
+  SDValue Store1 = getInstr(Hexagon::S2_storerd_io, dl, MVT::Other, Ops1, DAG);
+  const SDValue Ops0[] = {ST->getBasePtr(), C0, Combine0, Store1};
+  SDValue Store0 = getInstr(Hexagon::S2_storerd_io, dl, MVT::Other, Ops0, DAG);
+  return Store0;
 }
 
 SDValue
@@ -1747,6 +1798,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::SETCC:
     case ISD::INTRINSIC_VOID:          return Op;
     case ISD::INTRINSIC_WO_CHAIN:      return LowerHvxIntrinsic(Op, DAG);
+    case ISD::STORE:                   return LowerHvxStore(Op, DAG);
     // Unaligned loads will be handled by the default lowering.
     case ISD::LOAD:                    return SDValue();
   }
@@ -1756,28 +1808,6 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("Unhandled HVX operation");
 }
 
-void
-HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
-      SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
-}
-
-void
-HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
-      SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
-  unsigned Opc = N->getOpcode();
-  switch (Opc) {
-    case ISD::BITCAST:
-      if (isHvxBoolTy(ty(N->getOperand(0)))) {
-        SDValue Op(N, 0);
-        SDValue C = LowerHvxBitcast(Op, DAG);
-        Results.push_back(C);
-      }
-      break;
-    default:
-      break;
-  }
-}
-
 SDValue
 HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
       const {
@@ -1810,16 +1840,3 @@ HexagonTargetLowering::isHvxOperation(SDValue Op) const {
                         return Subtarget.isHVXVectorType(ty(V), true);
                       });
 }
-
-bool
-HexagonTargetLowering::isHvxOperation(SDNode *N) const {
-  // If the type of any result, or any operand type are HVX vector types,
-  // this is an HVX operation.
-  auto IsHvxTy = [this] (EVT Ty) {
-    return Ty.isSimple() && Subtarget.isHVXVectorType(Ty.getSimpleVT(), true);
-  };
-  auto IsHvxOp = [this] (SDValue Op) {
-    return Subtarget.isHVXVectorType(ty(Op), true);
-  };
-  return llvm::any_of(N->values(), IsHvxTy) || llvm::any_of(N->ops(), IsHvxOp);
-}

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-hvx-pred-bitcast.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-hvx-pred-bitcast.ll
index 6bf2cdaf146e..c07251035240 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/isel-hvx-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-hvx-pred-bitcast.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; CHECK-LABEL: danny:
-; CHECK: vrmpy
+; CHECK: vand
 define i64 @danny(<64 x i8> %a0, <64 x i8> %a1) #0 {
   %v0 = icmp eq <64 x i8> %a0, %a1
   %v1 = bitcast <64 x i1> %v0 to i64
@@ -9,19 +9,18 @@ define i64 @danny(<64 x i8> %a0, <64 x i8> %a1) #0 {
 }
 
 ; CHECK-LABEL: sammy:
-; CHECK: vrmpy
+; CHECK: vand
 define i32 @sammy(<32 x i16> %a0, <32 x i16> %a1) #0 {
   %v0 = icmp eq <32 x i16> %a0, %a1
   %v1 = bitcast <32 x i1> %v0 to i32
   ret i32 %v1
 }
 
-; CHECK-LABEL: kirby:
-; CHECK: vrmpy
-define i16 @kirby(<16 x i32> %a0, <16 x i32> %a1) #0 {
-  %v0 = icmp eq <16 x i32> %a0, %a1
-  %v1 = bitcast <16 x i1> %v0 to i16
-  ret i16 %v1
-}
+; This one still doesn't work.
+; define i16 @kirby(<16 x i32> %a0, <16 x i32> %a1) #0 {
+;   %v0 = icmp eq <16 x i32> %a0, %a1
+;   %v1 = bitcast <16 x i1> %v0 to i16
+;   ret i16 %v1
+; }
 
 attributes #0 = { nounwind "target-cpu"="hexagonv66" "target-features"="+v66,+hvx,+hvxv66,+hvx-length64b" }

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-store-bitcast-v128i1.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-store-bitcast-v128i1.ll
deleted file mode 100644
index d8d24a052660..000000000000
--- a/llvm/test/CodeGen/Hexagon/autohvx/isel-store-bitcast-v128i1.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
-
-; Primarily check if this compiles without failing.
-
-; CHECK-LABEL: fred:
-; CHECK: memd
-define void @fred(<128 x i8> %a0, <128 x i8> %a1, i128* %a2) #0 {
-  %v0 = icmp eq <128 x i8> %a0, %a1
-  %v1 = bitcast <128 x i1> %v0 to i128
-  store i128 %v1, i128* %a2, align 16
-  ret void
-}
-
-attributes #0 = { nounwind "target-cpu"="hexagonv66" "target-features"="+v66,+hvx,+hvxv66,+hvx-length128b" }
-

diff  --git a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll
index 0834424ee4dc..6aae095440d6 100644
--- a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll
@@ -1,47 +1,51 @@
 ; RUN: llc -march=hexagon  < %s | FileCheck %s
 
-; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered
-; without crashing.
-; CHECK: valign
+; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered.
+
+; CHECK: v[[REG1:[0-9]+]] = valign(v{{[0-9]+}},v{{[0-9]+}},#2)
+; CHECK: v[[REG2:[0-9]+]] = vor(v{{[0-9]+}},v[[REG1]])
+; CHECK: v[[REG3:[0-9]+]] = valign(v[[REG2]],v[[REG2]],#1)
+; CHECK: v[[REG4:[0-9]+]] = vor(v{{[0-9]+}},v[[REG3]])
+; CHECK: v[[REG5:[0-9]+]] = vand(v[[REG4]],v{{[0-9]+}})
+; CHECK: v{{[0-9]+}}.w = vasl(v[[REG5]].w,v{{[0-9]+}}.w)
 
 target triple = "hexagon"
 
-define dso_local void @f0() local_unnamed_addr #0 {
-b0:
-  br i1 undef, label %b2, label %b1
-
-b1:                                               ; preds = %b0
-  %v0 = load i8, i8* undef, align 1
-  %v1 = zext i8 %v0 to i32
-  %v2 = add nsw i32 %v1, -1
-  %v3 = insertelement <64 x i32> undef, i32 %v2, i32 0
-  %v4 = shufflevector <64 x i32> %v3, <64 x i32> undef, <64 x i32> zeroinitializer
-  %v5 = icmp ule <64 x i32> undef, %v4
-  %v6 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* nonnull undef, i32 1, <64 x i1> %v5, <64 x i8> undef)
-  %v7 = lshr <64 x i8> %v6, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
-  %v8 = and <64 x i8> %v7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  %v9 = zext <64 x i8> %v8 to <64 x i32>
-  %v10 = add nsw <64 x i32> undef, %v9
-  %v11 = select <64 x i1> %v5, <64 x i32> %v10, <64 x i32> undef
-  %v12 = add <64 x i32> %v11, undef
-  %v13 = add <64 x i32> %v12, undef
-  %v14 = add <64 x i32> %v13, undef
-  %v15 = add <64 x i32> %v14, undef
-  %v16 = add <64 x i32> %v15, undef
-  %v17 = add <64 x i32> %v16, undef
-  %v18 = add <64 x i32> %v17, undef
-  %v19 = extractelement <64 x i32> %v18, i32 0
-  %v20 = getelementptr inbounds i8, i8* null, i32 2160
-  %v21 = bitcast i8* %v20 to i32*
-  store i32 %v19, i32* %v21, align 4
-  br label %b2
-
-b2:                                               ; preds = %b1, %b0
+define dso_local void @fun() local_unnamed_addr #0 {
+entry:
+  br i1 undef, label %cleanup, label %if.end
+
+if.end:
+  %0 = load i8, i8* undef, align 1
+  %conv13.i = zext i8 %0 to i32
+  %trip.count.minus.1216 = add nsw i32 %conv13.i, -1
+  %broadcast.splatinsert221 = insertelement <64 x i32> undef, i32 %trip.count.minus.1216, i32 0
+  %broadcast.splat222 = shufflevector <64 x i32> %broadcast.splatinsert221, <64 x i32> undef, <64 x i32> zeroinitializer
+  %1 = icmp ule <64 x i32> undef, %broadcast.splat222
+  %wide.masked.load223 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* nonnull undef, i32 1, <64 x i1> %1, <64 x i8> undef)
+  %2 = lshr <64 x i8> %wide.masked.load223, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %3 = and <64 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %4 = zext <64 x i8> %3 to <64 x i32>
+  %5 = add nsw <64 x i32> undef, %4
+  %6 = select <64 x i1> %1, <64 x i32> %5, <64 x i32> undef
+  %bin.rdx225 = add <64 x i32> %6, undef
+  %bin.rdx227 = add <64 x i32> %bin.rdx225, undef
+  %bin.rdx229 = add <64 x i32> %bin.rdx227, undef
+  %bin.rdx231 = add <64 x i32> %bin.rdx229, undef
+  %bin.rdx233 = add <64 x i32> %bin.rdx231, undef
+  %bin.rdx235 = add <64 x i32> %bin.rdx233, undef
+  %bin.rdx237 = add <64 x i32> %bin.rdx235, undef
+  %7 = extractelement <64 x i32> %bin.rdx237, i32 0
+  %nChans = getelementptr inbounds i8, i8* null, i32 2160
+  %8 = bitcast i8* %nChans to i32*
+  store i32 %7, i32* %8, align 4
+  br label %cleanup
+
+cleanup:
   ret void
 }
 
 ; Function Attrs: argmemonly nounwind readonly willreturn
-declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32 immarg, <64 x i1>, <64 x i8>) #1
+declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
 
 attributes #0 = { "target-features"="+hvx-length64b,+hvxv67,+v67,-long-calls" }
-attributes #1 = { argmemonly nounwind readonly willreturn }

diff  --git a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll
index 407a30cb738d..0c7949787c14 100644
--- a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll
+++ b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll
@@ -1,47 +1,47 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc  < %s | FileCheck %s
 
 ; This test checks that store a vector predicate of type v128i1 is lowered
-; without crashing.
-; CHECK: valign
+; and two double stores are generated.
+
+; CHECK-DAG: memd(r{{[0-9]+}}+#0) = r{{[0-9]+}}:{{[0-9]+}}
+; CHECK-DAG: memd(r{{[0-9]+}}+#8) = r{{[0-9]+}}:{{[0-9]+}}
 
 target triple = "hexagon"
 
-define dso_local void @f0() local_unnamed_addr #0 {
-b0:
-  br i1 undef, label %b2, label %b1
-
-b1:                                               ; preds = %b0
-  %v0 = load i8, i8* undef, align 1
-  %v1 = zext i8 %v0 to i32
-  %v2 = add nsw i32 %v1, -1
-  %v3 = insertelement <128 x i32> undef, i32 %v2, i32 0
-  %v4 = shufflevector <128 x i32> %v3, <128 x i32> undef, <128 x i32> zeroinitializer
-  %v5 = icmp ule <128 x i32> undef, %v4
-  %v6 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* nonnull undef, i32 1, <128 x i1> %v5, <128 x i8> undef)
-  %v7 = lshr <128 x i8> %v6, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
-  %v8 = and <128 x i8> %v7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  %v9 = zext <128 x i8> %v8 to <128 x i32>
-  %v10 = add nsw <128 x i32> undef, %v9
-  %v11 = select <128 x i1> %v5, <128 x i32> %v10, <128 x i32> undef
-  %v12 = add <128 x i32> %v11, undef
-  %v13 = add <128 x i32> %v12, undef
-  %v14 = add <128 x i32> %v13, undef
-  %v15 = add <128 x i32> %v14, undef
-  %v16 = add <128 x i32> %v15, undef
-  %v17 = add <128 x i32> %v16, undef
-  %v18 = add <128 x i32> %v17, undef
-  %v19 = extractelement <128 x i32> %v18, i32 0
-  %v20 = getelementptr inbounds i8, i8* null, i32 2160
-  %v21 = bitcast i8* %v20 to i32*
-  store i32 %v19, i32* %v21, align 4
-  br label %b2
-
-b2:                                               ; preds = %b1, %b0
+define dso_local void @raac_UnpackADIFHeader() local_unnamed_addr #0 {
+entry:
+  br i1 undef, label %cleanup, label %if.end
+
+if.end:
+  %0 = load i8, i8* undef, align 1
+  %conv13.i = zext i8 %0 to i32
+  %trip.count.minus.1216 = add nsw i32 %conv13.i, -1
+  %broadcast.splatinsert221 = insertelement <128 x i32> undef, i32 %trip.count.minus.1216, i32 0
+  %broadcast.splat222 = shufflevector <128 x i32> %broadcast.splatinsert221, <128 x i32> undef, <128 x i32> zeroinitializer
+  %1 = icmp ule <128 x i32> undef, %broadcast.splat222
+  %wide.masked.load223 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* nonnull undef, i32 1, <128 x i1> %1, <128 x i8> undef)
+  %2 = lshr <128 x i8> %wide.masked.load223, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %3 = and <128 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %4 = zext <128 x i8> %3 to <128 x i32>
+  %5 = add nsw <128 x i32> undef, %4
+  %6 = select <128 x i1> %1, <128 x i32> %5, <128 x i32> undef
+  %bin.rdx225 = add <128 x i32> %6, undef
+  %bin.rdx227 = add <128 x i32> %bin.rdx225, undef
+  %bin.rdx229 = add <128 x i32> %bin.rdx227, undef
+  %bin.rdx231 = add <128 x i32> %bin.rdx229, undef
+  %bin.rdx233 = add <128 x i32> %bin.rdx231, undef
+  %bin.rdx235 = add <128 x i32> %bin.rdx233, undef
+  %bin.rdx237 = add <128 x i32> %bin.rdx235, undef
+  %7 = extractelement <128 x i32> %bin.rdx237, i32 0
+  %nChans = getelementptr inbounds i8, i8* null, i32 2160
+  %8 = bitcast i8* %nChans to i32*
+  store i32 %7, i32* %8, align 4
+  br label %cleanup
+
+cleanup:
   ret void
-}
+  }
 
-; Function Attrs: argmemonly nounwind readonly willreturn
-declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #1
+declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>)
 
 attributes #0 = { "target-features"="+hvx-length128b,+hvxv67,+v67,-long-calls" }
-attributes #1 = { argmemonly nounwind readonly willreturn }