[llvm] Add HVX vgather/vscatter Support (PR #164421)

Fateme Hosseini via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 21 08:58:08 PDT 2025


https://github.com/fhossein-quic updated https://github.com/llvm/llvm-project/pull/164421

From 4288f1ee828dc154724ffd8b3fbc0f59a6dffa97 Mon Sep 17 00:00:00 2001
From: Sergei Larin <slarin at codeaurora.org>
Date: Mon, 11 Nov 2024 14:08:54 -0800
Subject: [PATCH] Add HVX vgather/vscatter support

This patch adds HVX vgather/vscatter generation for i16, i32, and i8.
It also adds a flag to control generation of scatter/gather
instructions for HVX; it defaults to disabled.
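
As an illustrative sketch (not part of the patch): with
-hexagon-allow-scatter-gather-hvx enabled, the vectorizer may emit a
masked gather such as

  %g = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(
           <64 x ptr> %ptrs, i32 2, <64 x i1> %mask, <64 x i16> poison)

which HexagonVectorCombine then rewrites into the HVX vgather
intrinsics handled by this patch.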

Co-authored-by: Sergei Larin <slarin at quicinc.com>
Co-authored-by: Maxime Schmitt <maxime.schmitt at qti.qualcomm.com>
---
 llvm/include/llvm/IR/IntrinsicsHexagon.td     |  12 +
 .../Target/Hexagon/HexagonISelDAGToDAG.cpp    |   4 +-
 .../Target/Hexagon/HexagonISelDAGToDAGHVX.cpp |   4 +
 .../Target/Hexagon/HexagonISelLowering.cpp    |   4 +-
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp  | 162 ++--
 llvm/lib/Target/Hexagon/HexagonPatternsV65.td |  13 +
 .../Hexagon/HexagonTargetTransformInfo.cpp    |  59 ++
 .../Hexagon/HexagonTargetTransformInfo.h      |   6 +
 .../Target/Hexagon/HexagonVectorCombine.cpp   | 878 +++++++++++++++++-
 .../autohvx/ripple_scalarize_scatter.ll       |  68 ++
 .../CodeGen/Hexagon/autohvx/ripple_vgather.ll |  55 ++
 .../Hexagon/autohvx/ripple_vgather_SpVV.ll    |  54 ++
 .../Hexagon/autohvx/ripple_vscatter.ll        |  52 ++
 llvm/test/CodeGen/Hexagon/masked_gather.ll    |  58 ++
 llvm/test/CodeGen/Hexagon/vector-gather.ll    |  27 +
 15 files changed, 1379 insertions(+), 77 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/masked_gather.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/vector-gather.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 67b873d16cb5a..20ba51ade35a7 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -447,3 +447,15 @@ def int_hexagon_instrprof_custom
 
 
 include "llvm/IR/IntrinsicsHexagonDep.td"
+
+class Hexagon__ptri32i32v64i16_Intrinsic<string GCCIntSuffix,
+      list<IntrinsicProperty> intr_properties = [IntrNoMem]>
+  : Hexagon_Intrinsic<GCCIntSuffix,
+       [], [llvm_ptr_ty,llvm_i32_ty,llvm_i32_ty,llvm_v64i16_ty],
+       intr_properties>;
+
+def int_hexagon_V6_vgather_vscattermh :
+Hexagon__ptri32i32v64i16_Intrinsic<"HEXAGON_V6_vgather_vscattermh", [IntrArgMemOnly]>;
+
+def int_hexagon_V6_vgather_vscattermh_128B :
+Hexagon__ptri32i32v32i32_Intrinsic<"HEXAGON_V6_vgather_vscattermh_128B", [IntrArgMemOnly]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index e285e04543694..7ee280d8fc8b0 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
       IntNo == Intrinsic::hexagon_V6_vgathermh ||
       IntNo == Intrinsic::hexagon_V6_vgathermh_128B ||
       IntNo == Intrinsic::hexagon_V6_vgathermhw ||
-      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) {
+      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B ||
+      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh ||
+      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) {
     SelectV65Gather(N);
     return;
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index c7a4f6803a243..3cc146b13f8f8 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
   case Intrinsic::hexagon_V6_vgathermhw_128B:
     Opcode = Hexagon::V6_vgathermhw_pseudo;
     break;
+  case Intrinsic::hexagon_V6_vgather_vscattermh:
+  case Intrinsic::hexagon_V6_vgather_vscattermh_128B:
+    Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo;
+    break;
   }
 
   SDVTList VTs = CurDAG->getVTList(MVT::Other);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 9f7f434b66fa1..526b4de975915 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::hexagon_V6_vgathermhq:
   case Intrinsic::hexagon_V6_vgathermhq_128B:
   case Intrinsic::hexagon_V6_vgathermhwq:
-  case Intrinsic::hexagon_V6_vgathermhwq_128B: {
+  case Intrinsic::hexagon_V6_vgathermhwq_128B:
+  case Intrinsic::hexagon_V6_vgather_vscattermh:
+  case Intrinsic::hexagon_V6_vgather_vscattermh_128B: {
     const Module &M = *I.getParent()->getParent()->getParent();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Type *VecTy = I.getArgOperand(1)->getType();
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 939841ae817c3..47726d6447ad8 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {
   MachineBasicBlock::iterator First;
 
   switch (Opc) {
-    case Hexagon::V6_vgathermh_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
-
-    case Hexagon::V6_vgathermw_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
-
-    case Hexagon::V6_vgathermhw_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
-
-    case Hexagon::V6_vgathermhq_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4))
-                  .add(MI.getOperand(5));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
-
-    case Hexagon::V6_vgathermwq_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4))
-                  .add(MI.getOperand(5));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
-
-    case Hexagon::V6_vgathermhwq_pseudo:
-      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
-                  .add(MI.getOperand(2))
-                  .add(MI.getOperand(3))
-                  .add(MI.getOperand(4))
-                  .add(MI.getOperand(5));
-      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
-          .add(MI.getOperand(0))
-          .addImm(MI.getOperand(1).getImm())
-          .addReg(Hexagon::VTMP);
-      MBB.erase(MI);
-      return First.getInstrIterator();
+  case Hexagon::V6_vgather_vscatter_mh_pseudo:
+    // This is mainly a placeholder; it will be extended.
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh))
+        .add(MI.getOperand(2))
+        .add(MI.getOperand(3))
+        .add(MI.getOperand(4))
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+  case Hexagon::V6_vgathermh_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+
+  case Hexagon::V6_vgathermw_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+
+  case Hexagon::V6_vgathermhw_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+
+  case Hexagon::V6_vgathermhq_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4))
+                .add(MI.getOperand(5));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+
+  case Hexagon::V6_vgathermwq_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4))
+                .add(MI.getOperand(5));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
+
+  case Hexagon::V6_vgathermhwq_pseudo:
+    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
+                .add(MI.getOperand(2))
+                .add(MI.getOperand(3))
+                .add(MI.getOperand(4))
+                .add(MI.getOperand(5));
+    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+        .add(MI.getOperand(0))
+        .addImm(MI.getOperand(1).getImm())
+        .addReg(Hexagon::VTMP);
+    MBB.erase(MI);
+    return First.getInstrIterator();
   }
 
   return MI.getIterator();
@@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
   case Hexagon::V6_vL32b_nt_tmp_npred_ai:
   case Hexagon::V6_vS32Ub_npred_ai:
   case Hexagon::V6_vgathermh_pseudo:
+  case Hexagon::V6_vgather_vscatter_mh_pseudo:
   case Hexagon::V6_vgathermw_pseudo:
   case Hexagon::V6_vgathermhw_pseudo:
   case Hexagon::V6_vgathermhq_pseudo:
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
index f927f9b9e7c34..42393d081f1a7 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
@@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo  : vgathermh<HvxVR>;
 defm V6_vgathermw_pseudo  : vgathermw<HvxVR>;
 defm V6_vgathermhw_pseudo  : vgathermhw<HvxWR>;
 
+
+multiclass vgather_scatter_mh<RegisterClass RC> {
+  let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+  mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
+  def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
+                           (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+                                IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
+                           ".error \"should not emit\" ",
+                           []>;
+}
+
+defm V6_vgather_vscatter_mh_pseudo  : vgather_scatter_mh<HvxVR>;
+
 multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {
   let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
   mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 171e2949366ad..e925e041eb64e 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -31,6 +31,10 @@ using namespace llvm;
 static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
     cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
 
+cl::opt<bool> HexagonAllowScatterGatherHVX(
+    "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden,
+    cl::desc("Allow auto-generation of HVX scatter-gather"));
+
 static cl::opt<bool> EnableV68FloatAutoHVX(
     "force-hvx-float", cl::Hidden,
     cl::desc("Enable auto-vectorization of floatint point types on v68."));
@@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
   return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
 }
 
+bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
+  // For now, assume we cannot handle all HVX data types.
+  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+      !HexagonAllowScatterGatherHVX)
+    return false;
+  // This must be in sync with HexagonVectorCombine pass.
+  switch (Ty->getScalarSizeInBits()) {
+  case 8:
+    return (getTypeNumElements(Ty) == 128);
+  case 16:
+    if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32)
+      return (Alignment >= 2);
+    break;
+  case 32:
+    if (getTypeNumElements(Ty) == 32)
+      return (Alignment >= 4);
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
+bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const {
+  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+      !HexagonAllowScatterGatherHVX)
+    return false;
+  // This must be in sync with HexagonVectorCombine pass.
+  switch (Ty->getScalarSizeInBits()) {
+  case 8:
+    return (getTypeNumElements(Ty) == 128);
+  case 16:
+    if (getTypeNumElements(Ty) == 64)
+      return (Alignment >= 2);
+    break;
+  case 32:
+    if (getTypeNumElements(Ty) == 32)
+      return (Alignment >= 4);
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
+                                                Align Alignment) const {
+  return !isLegalMaskedGather(VTy, Alignment);
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy,
+                                                 Align Alignment) const {
+  return !isLegalMaskedScatter(VTy, Alignment);
+}
+
 /// --- Vector TTI end ---
 
 unsigned HexagonTTIImpl::getPrefetchDistance() const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index dbf16c99c314c..cec2bf9656ffc 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -169,6 +169,12 @@ class HexagonTTIImpl final : public BasicTTIImplBase<HexagonTTIImpl> {
                           unsigned AddressSpace) const override;
   bool isLegalMaskedLoad(Type *DataType, Align Alignment,
                          unsigned AddressSpace) const override;
+  bool isLegalMaskedGather(Type *Ty, Align Alignment) const override;
+  bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override;
+  bool forceScalarizeMaskedGather(VectorType *VTy,
+                                  Align Alignment) const override;
+  bool forceScalarizeMaskedScatter(VectorType *VTy,
+                                   Align Alignment) const override;
 
   /// @}
 
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 9ab52020e2e36..1f68fa4ff1356 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -57,6 +57,11 @@
 
 #define DEBUG_TYPE "hexagon-vc"
 
+// This constant represents the default HVX VTCM page size. It is boot-time
+// configurable, so we probably want an API to read it; for now, assume 128KB.
+#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
+
 using namespace llvm;
 
 namespace {
@@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
 
 class HvxIdioms {
 public:
+  enum DstQualifier {
+    Undefined = 0,
+    Arithmetic,
+    LdSt,
+    LLVM_Gather,
+    LLVM_Scatter,
+    HEX_Gather_Scatter,
+    HEX_Gather,
+    HEX_Scatter,
+    Call
+  };
+
   HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
     auto *Int32Ty = HVC.getIntTy(32);
     HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
@@ -473,6 +490,11 @@ class HvxIdioms {
   auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
                      Signedness SgnX, ArrayRef<Value *> WordY,
                      Signedness SgnY) const -> SmallVector<Value *>;
+  // Vector manipulations for Ripple
+  bool matchScatter(Instruction &In) const;
+  bool matchGather(Instruction &In) const;
+  Value *processVScatter(Instruction &In) const;
+  Value *processVGather(Instruction &In) const;
 
   VectorType *HvxI32Ty;
   VectorType *HvxP32Ty;
@@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
 }
 
 auto AlignVectors::run() -> bool {
-  LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName()
+  LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
                     << '\n');
   if (!createAddressGroups())
     return false;
@@ -1797,6 +1819,840 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
   return Ext;
 }
 
+inline bool HvxIdioms::matchScatter(Instruction &In) const {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+  if (!II)
+    return false;
+  return (II->getIntrinsicID() == Intrinsic::masked_scatter);
+}
+
+inline bool HvxIdioms::matchGather(Instruction &In) const {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+  if (!II)
+    return false;
+  return (II->getIntrinsicID() == Intrinsic::masked_gather);
+}
+
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
+
+// Binary instructions we want to handle as users of gather/scatter.
+inline bool isArithmetic(unsigned Opc) {
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::AShr:
+  case Instruction::LShr:
+  case Instruction::Shl:
+  case Instruction::UDiv:
+    return true;
+  }
+  return false;
+}
+
+// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
+inline Value *getPointer(Value *Ptr) {
+  assert(Ptr && "Unable to extract pointer");
+  if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
+    return Ptr;
+  if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
+    return getLoadStorePointerOperand(Ptr);
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
+    if (II->getIntrinsicID() == Intrinsic::masked_store)
+      return II->getOperand(1);
+  }
+  return nullptr;
+}
+
+static Instruction *selectDestination(Instruction *In,
+                                      HvxIdioms::DstQualifier &Qual) {
+  Instruction *Destination = nullptr;
+  if (!In)
+    return Destination;
+  if (isa<StoreInst>(In)) {
+    Destination = In;
+    Qual = HvxIdioms::LdSt;
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+    if (II->getIntrinsicID() == Intrinsic::masked_gather) {
+      Destination = In;
+      Qual = HvxIdioms::LLVM_Gather;
+    } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
+      Destination = In;
+      Qual = HvxIdioms::LLVM_Scatter;
+    } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
+      Destination = In;
+      Qual = HvxIdioms::LdSt;
+    } else if (II->getIntrinsicID() ==
+               Intrinsic::hexagon_V6_vgather_vscattermh) {
+      Destination = In;
+      Qual = HvxIdioms::HEX_Gather_Scatter;
+    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
+      Destination = In;
+      Qual = HvxIdioms::HEX_Scatter;
+    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
+      Destination = In;
+      Qual = HvxIdioms::HEX_Gather;
+    }
+  } else if (isa<ZExtInst>(In)) {
+    return locateDestination(In, Qual);
+  } else if (isa<CastInst>(In)) {
+    return locateDestination(In, Qual);
+  } else if (isa<CallInst>(In)) {
+    Destination = In;
+    Qual = HvxIdioms::Call;
+  } else if (isa<GetElementPtrInst>(In)) {
+    return locateDestination(In, Qual);
+  } else if (isArithmetic(In->getOpcode())) {
+    Destination = In;
+    Qual = HvxIdioms::Arithmetic;
+  } else {
+    LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
+  }
+  return Destination;
+}
+
+// This method attempts to find the destination (user) of a given intrinsic.
+// Given that these are produced only by Ripple, the number of options is
+// limited. The simplest case is an explicit store, which is in fact redundant
+// (since the HVX gather creates its own store during packetization); we still
+// need to figure out the address being stored to. The other cases are more
+// complicated, but still few.
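+// For example, in "%g = call ... @llvm.masked.gather(...); store %g, ptr %p",
+// the store is the destination (Qual = LdSt) and %p is the stored-to address.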
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
+  Instruction *Destination = nullptr;
+  if (!In)
+    return Destination;
+  // Get all possible destinations
+  SmallVector<Instruction *> Users;
+  // Iterate over the uses of the instruction
+  for (auto &U : In->uses()) {
+    if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
+      Destination = selectDestination(UI, Qual);
+      if (Destination)
+        Users.push_back(Destination);
+    }
+  }
+  // Now see which of the users (if any) is a memory destination.
+  for (auto *I : Users)
+    if (getPointer(I))
+      return I;
+  return Destination;
+}
+
+// The two intrinsics we handle here have the GEP in different positions.
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
+  assert(In && "Bad instruction");
+  IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
+  assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
+                  IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
+         "Not a gather Intrinsic");
+  GetElementPtrInst *GEPIndex = nullptr;
+  if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
+    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
+  else
+    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
+  return GEPIndex;
+}
+
+// Given the intrinsic, find its GEP argument and extract the base address it
+// uses. The method relies on how Ripple typically forms the GEP for
+// scatter/gather.
+static Value *locateAddressFromIntrinsic(Instruction *In) {
+  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+  if (!GEPIndex) {
+    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n");
+    return nullptr;
+  }
+  Value *BaseAddress = GEPIndex->getPointerOperand();
+  auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
+  if (IndexLoad)
+    return IndexLoad;
+
+  auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
+  if (IndexZEx) {
+    IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
+    if (IndexLoad)
+      return IndexLoad;
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
+    if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+      return locateAddressFromIntrinsic(II);
+  }
+  auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
+  if (BaseShuffle) {
+    IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
+    if (IndexLoad)
+      return IndexLoad;
+    auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
+    if (IE) {
+      auto *Src = IE->getOperand(1);
+      IndexLoad = dyn_cast<LoadInst>(Src);
+      if (IndexLoad)
+        return IndexLoad;
+      auto *Alloca = dyn_cast<AllocaInst>(Src);
+      if (Alloca)
+        return Alloca;
+      if (isa<Argument>(Src)) {
+        return Src;
+      }
+      if (isa<GlobalValue>(Src)) {
+        return Src;
+      }
+    }
+  }
+  LLVM_DEBUG(dbgs() << "  Unable to locate Address from intrinsic\n");
+  return nullptr;
+}
+
+static Type *getIndexType(Value *In) {
+  if (!In)
+    return nullptr;
+
+  if (isa<LoadInst>(In) || isa<StoreInst>(In))
+    return getLoadStoreType(In);
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+    if (II->getIntrinsicID() == Intrinsic::masked_load)
+      return II->getType();
+    if (II->getIntrinsicID() == Intrinsic::masked_store)
+      return II->getOperand(0)->getType();
+  }
+  return In->getType();
+}
+
+static Value *locateIndexesFromGEP(Value *In) {
+  if (!In)
+    return nullptr;
+  if (isa<LoadInst>(In))
+    return In;
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+    if (II->getIntrinsicID() == Intrinsic::masked_load)
+      return In;
+    if (II->getIntrinsicID() == Intrinsic::masked_gather)
+      return In;
+  }
+  if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
+    return locateIndexesFromGEP(IndexZEx->getOperand(0));
+  if (auto *IndexSEx = dyn_cast<SExtInst>(In))
+    return locateIndexesFromGEP(IndexSEx->getOperand(0));
+  if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
+    return locateIndexesFromGEP(BaseShuffle->getOperand(0));
+  if (auto *IE = dyn_cast<InsertElementInst>(In))
+    return locateIndexesFromGEP(IE->getOperand(1));
+  if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
+    return cstDataVector;
+  if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
+    return GEPIndex->getOperand(0);
+  return nullptr;
+}
+
+// Given the intrinsic, find its GEP argument and extract the offsets from
+// the base address it uses.
+static Value *locateIndexesFromIntrinsic(Instruction *In) {
+  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+  if (!GEPIndex) {
+    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n");
+    return nullptr;
+  }
+  Value *Indexes = GEPIndex->getOperand(1);
+  if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
+    return IndexLoad;
+
+  LLVM_DEBUG(dbgs() << "  Unable to locate Index from intrinsic\n");
+  return nullptr;
+}
+
+// Because of the awkward definition of many Hexagon intrinsics, we often have
+// to reinterpret the HVX-native <64 x i16> as <32 x i32>, which in practice
+// is a NOP for all use cases, so this only exists to make the IR builder
+// happy.
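+// E.g. both <64 x i16> and <32 x i32> span one full 128-byte HVX vector
+// register, so the identity shuffle + bitcast below change only the IR type.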
+inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
+                                               IRBuilderBase &Builder,
+                                               LLVMContext &Ctx, Value *I) {
+  assert(I && "Unable to reinterprete cast");
+  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+  std::vector<unsigned> shuffleMask;
+  for (unsigned i = 0; i < 64; ++i)
+    shuffleMask.push_back(i);
+  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+  Value *CastShuffle =
+      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+  return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
+}
+
+// Recast <128 x i8> as <32 x i32>
+inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
+                                              IRBuilderBase &Builder,
+                                              LLVMContext &Ctx, Value *I) {
+  assert(I && "Unable to reinterprete cast");
+  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+  std::vector<unsigned> shuffleMask;
+  for (unsigned i = 0; i < 128; ++i)
+    shuffleMask.push_back(i);
+  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+  Value *CastShuffle =
+      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+  return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
+}
+
+// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
+inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
+                           IRBuilderBase &Builder, LLVMContext &Ctx,
+                           unsigned int pattern) {
+  std::vector<unsigned int> byteMask;
+  for (unsigned i = 0; i < 32; ++i)
+    byteMask.push_back(pattern);
+
+  return Builder.CreateIntrinsic(
+      HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
+      {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
+      nullptr);
+}
+
+Value *HvxIdioms::processVScatter(Instruction &In) const {
+  auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
+  assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");
+  unsigned InpSize = HVC.getSizeOf(InpTy);
+  unsigned Elements = HVC.length(InpTy);
+  auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
+  assert(ElemTy && "llvm.scatter needs integer type argument");
+  unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
+  auto *F = In.getFunction();
+  LLVMContext &Ctx = F->getContext();
+  LLVM_DEBUG(dbgs() << "\n[Process scatter](" << In << ")\n"
+                    << *In.getParent() << "\n");
+  LLVM_DEBUG(dbgs() << "  Input type(" << *InpTy << ") elements(" << Elements
+                    << ") VecLen(" << InpSize << ") type(" << *ElemTy
+                    << ") ElemWidth(" << ElemWidth << ")\n");
+
+  IRBuilder Builder(In.getParent(), In.getIterator(),
+                    InstSimplifyFolder(HVC.DL));
+
+  auto *ValueToScatter = In.getOperand(0);
+  LLVM_DEBUG(dbgs() << "  ValueToScatter   : " << *ValueToScatter << "\n");
+
+  if (HVC.HST.getVectorLength() != InpSize) {
+    LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
+                      << ") for vscatter\n");
+    return nullptr;
+  }
+
+  // Base address of indexes.
+  auto *IndexLoad = locateAddressFromIntrinsic(&In);
+  if (!IndexLoad)
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "  IndexLoad        : " << *IndexLoad << "\n");
+
+  // Address of destination. Must be in VTCM.
+  auto *Ptr = getPointer(IndexLoad);
+  if (!Ptr)
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "  Ptr              : " << *Ptr << "\n");
+  // Indexes/offsets
+  auto *Indexes = locateIndexesFromIntrinsic(&In);
+  if (!Indexes)
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "  Indexes          : " << *Indexes << "\n");
+  Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
+                                                    "cst_ptr_to_i32");
+  LLVM_DEBUG(dbgs() << "  CastedDst        : " << *CastedDst << "\n");
+  // Adjust Indexes
+  auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+  Value *CastIndex = nullptr;
+  if (cstDataVector) {
+    // Our indexes are represented as a constant. We need them in a reg.
+    AllocaInst *IndexesAlloca =
+        Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
+    auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca);
+    LLVM_DEBUG(dbgs() << "  StoreIndexes     : " << *StoreIndexes << "\n");
+    CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
+                                   IndexesAlloca, "reload_index");
+  } else {
+    if (ElemWidth == 2)
+      CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+    else
+      CastIndex = Indexes;
+  }
+  LLVM_DEBUG(dbgs() << "  Cast index       : " << *CastIndex << ")\n");
+
+  if (ElemWidth == 1) {
+    // v128i8: there is no native instruction for this.
+    // Do this as two Hi/Lo scatters with masking.
+    Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+    // Unpack indexes. We assume that indexes are in 128i8 format and need to
+    // be expanded to Hi/Lo 64i16.
+    Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
+    auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+    auto *UnpackedIndexes = Builder.CreateIntrinsic(
+        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
+    LLVM_DEBUG(dbgs() << "  UnpackedIndexes  : " << *UnpackedIndexes << ")\n");
+
+    auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+    auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+    Value *IndexHi =
+        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+    Value *IndexLo =
+        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+    LLVM_DEBUG(dbgs() << "  UnpackedIndHi    : " << *IndexHi << ")\n");
+    LLVM_DEBUG(dbgs() << "  UnpackedIndLo    : " << *IndexLo << ")\n");
+    // Now unpack values to scatter
+    Value *CastSrc =
+        getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
+    LLVM_DEBUG(dbgs() << "  CastSrc          : " << *CastSrc << ")\n");
+    auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
+        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
+    LLVM_DEBUG(dbgs() << "  UnpackedValToScat: " << *UnpackedValueToScatter
+                      << ")\n");
+
+    Value *UVSHi =
+        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
+    Value *UVSLo =
+        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
+    LLVM_DEBUG(dbgs() << "  UVSHi            : " << *UVSHi << ")\n");
+    LLVM_DEBUG(dbgs() << "  UVSLo            : " << *UVSLo << ")\n");
+
+    // Create the mask for individual bytes
+    auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+    LLVM_DEBUG(dbgs() << "  QByteMask        : " << *QByteMask << "\n");
+
+    auto *ResHi = Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+         IndexHi, UVSHi},
+        nullptr);
+    LLVM_DEBUG(dbgs() << "  ResHi            : " << *ResHi << ")\n");
+    return Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+         IndexLo, UVSLo},
+        nullptr);
+  } else if (ElemWidth == 2) {
+    Value *CastSrc =
+        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
+    LLVM_DEBUG(dbgs() << "  CastSrc        : " << *CastSrc << ")\n");
+    return Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
+        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+         CastSrc},
+        nullptr);
+  } else if (ElemWidth == 4) {
+    return Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
+        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+         ValueToScatter},
+        nullptr);
+  } else {
+    LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
+    return nullptr;
+  }
+}
+
+Value *HvxIdioms::processVGather(Instruction &In) const {
+  auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
+  assert(InpTy && "Cannot handle no vector type for llvm.gather");
+  auto *ElemTy = dyn_cast<PointerType>(InpTy->getElementType());
+  assert(ElemTy && "llvm.gather needs vector of ptr argument");
+  auto *F = In.getFunction();
+  LLVMContext &Ctx = F->getContext();
+  LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
+                    << *In.getParent() << "\n");
+  LLVM_DEBUG(dbgs() << "  Input type(" << *InpTy << ") elements("
+                    << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
+                    << ") type(" << *ElemTy << ") Access alignment("
+                    << *In.getOperand(1) << ") AddressSpace("
+                    << ElemTy->getAddressSpace() << ")\n");
+
+  // TODO: Handle masking of elements.
+  auto *MaskTy = dyn_cast<VectorType>(In.getOperand(2)->getType());
+  assert(MaskTy && "llvm.gather needs vector for mask");
+  IRBuilder Builder(In.getParent(), In.getIterator(),
+                    InstSimplifyFolder(HVC.DL));
+
+  // See who is using the result. The difference between the LLVM and HVX
+  // vgather intrinsics makes it impossible to handle all cases with temp
+  // storage. Alloca in VTCM is not yet supported, so for now we just bail
+  // out in those cases.
+  HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
+  Instruction *Dst = locateDestination(&In, Qual);
+  if (!Dst) {
+    LLVM_DEBUG(dbgs() << "  Unable to locate vgather destination\n");
+    return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "  Destination    : " << *Dst << " Qual(" << Qual
+                    << ")\n");
+
+  // Address of destination. Must be in VTCM.
+  auto *Ptr = getPointer(Dst);
+  if (!Ptr) {
+    LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
+    return nullptr;
+  }
+
+  // Result type. Assume it is a vector type.
+  auto *DstType = cast<VectorType>(getIndexType(Dst));
+  assert(DstType && "Cannot handle non vector dst type for llvm.gather");
+
+  // Base address for sources to be loaded
+  auto *IndexLoad = locateAddressFromIntrinsic(&In);
+  if (!IndexLoad)
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "  IndexLoad      : " << *IndexLoad << "\n");
+
+  // Gather indexes/offsets
+  auto *Indexes = locateIndexesFromIntrinsic(&In);
+  if (!Indexes)
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "  Indexes        : " << *Indexes << "\n");
+
+  Instruction *Gather = nullptr;
+  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+  if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
+    // We assume the address space is VTCM. We also assume that all
+    // pointers in Operand(0) have the same base(!).
+    // This is the most basic case of all the above.
+    unsigned OutputSize = HVC.getSizeOf(DstType);
+    auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
+    unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
+    LLVM_DEBUG(dbgs() << "  Buffer type    : " << *Ptr->getType()
+                      << "  Address space ("
+                      << Ptr->getType()->getPointerAddressSpace() << ")\n"
+                      << "  Result type    : " << *DstType
+                      << "\n  Size in bytes  : " << OutputSize
+                      << " element type(" << *DstElemTy
+                      << ")\n  ElemWidth      : " << ElemWidth << " bytes\n");
+
+    auto *IndexType = cast<VectorType>(getIndexType(Indexes));
+    assert(IndexType && "Cannot handle non-vector index type for llvm.gather");
+    unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
+    LLVM_DEBUG(dbgs() << "  IndexWidth(" << IndexWidth << ")\n");
+
+    // Intrinsic takes i32 instead of pointer so cast.
+    Value *CastedPtr = Builder.CreateBitOrPointerCast(
+        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+    // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
+    // int_hexagon_V6_vgathermh       [... , llvm_v16i32_ty]
+    // int_hexagon_V6_vgathermh_128B  [... , llvm_v32i32_ty]
+    // int_hexagon_V6_vgathermhw      [... , llvm_v32i32_ty]
+    // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
+    // int_hexagon_V6_vgathermw       [... , llvm_v16i32_ty]
+    // int_hexagon_V6_vgathermw_128B  [... , llvm_v32i32_ty]
+    if (HVC.HST.getVectorLength() == OutputSize) {
+      if (ElemWidth == 1) {
+        // v128i8: there is no native instruction for this.
+        // Do this as two Hi/Lo gathers with masking.
+        // Unpack indexes. We assume that indexes are in 128i8 format and need
+        // to be expanded to Hi/Lo 64i16.
+        Value *CastIndexes =
+            Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
+        auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+        auto *UnpackedIndexes =
+            Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
+                                    V6_vunpack, CastIndexes, nullptr);
+        LLVM_DEBUG(dbgs() << "  UnpackedIndexes : " << *UnpackedIndexes
+                          << ")\n");
+
+        auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+        auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+        Value *IndexHi =
+            HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+        Value *IndexLo =
+            HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+        LLVM_DEBUG(dbgs() << "  UnpackedIndHi   : " << *IndexHi << ")\n");
+        LLVM_DEBUG(dbgs() << "  UnpackedIndLo   : " << *IndexLo << ")\n");
+        // Create the mask for individual bytes
+        auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+        LLVM_DEBUG(dbgs() << "  QByteMask       : " << *QByteMask << "\n");
+        // We use our destination allocation as temp storage.
+        // This is unlikely to work properly for masked gather.
+        auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
+        auto GatherHi = Builder.CreateIntrinsic(
+            Type::getVoidTy(Ctx), V6_vgather,
+            {Ptr, QByteMask, CastedPtr,
+             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
+            nullptr);
+        LLVM_DEBUG(dbgs() << "  GatherHi        : " << *GatherHi << ")\n");
+        // Rematerialize the result
+        Value *LoadedResultHi = Builder.CreateLoad(
+            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
+        LLVM_DEBUG(dbgs() << "  LoadedResultHi : " << *LoadedResultHi << "\n");
+        // Same for the low part. Here we use Gather to return a non-null
+        // result from this function and continue iterating. We also delete
+        // the Dst store below.
+        Gather = Builder.CreateIntrinsic(
+            Type::getVoidTy(Ctx), V6_vgather,
+            {Ptr, QByteMask, CastedPtr,
+             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
+            nullptr);
+        LLVM_DEBUG(dbgs() << "  GatherLo        : " << *Gather << ")\n");
+        Value *LoadedResultLo = Builder.CreateLoad(
+            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
+        LLVM_DEBUG(dbgs() << "  LoadedResultLo : " << *LoadedResultLo << "\n");
+        // Now we have properly sized bytes in every other position
+        // B b A a c a A b B c f F g G h H is presented as
+        // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
+        // Use vpack to gather them
+        auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
+        auto Res = Builder.CreateIntrinsic(
+            NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
+        LLVM_DEBUG(dbgs() << "  ScaledRes      : " << *Res << "\n");
+        auto *StoreRes = Builder.CreateStore(Res, Ptr);
+        LLVM_DEBUG(dbgs() << "  StoreRes       : " << *StoreRes << "\n");
+      } else if (ElemWidth == 2) {
+        // v32i16
+        if (IndexWidth == 2) {
+          // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match.
+          Value *CastIndex =
+              getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+          LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n");
+          // Shift all i16 left by 1 to convert halfword indexes into byte
+          // offsets.
+          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+          Value *AdjustedIndex = HVC.createHvxIntrinsic(
+              Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+          LLVM_DEBUG(dbgs()
+                     << "  Shifted half index: " << *AdjustedIndex << ")\n");
+
+          auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
+          // The 3rd argument is the size of the region to gather from. We
+          // probably want to set it to the max VTCM size.
+          Gather = Builder.CreateIntrinsic(
+              Type::getVoidTy(Ctx), V6_vgather,
+              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+               AdjustedIndex},
+              nullptr);
+          for (auto &U : Dst->uses()) {
+            if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+              dbgs() << "    dst used by: " << *UI << "\n";
+          }
+          for (auto &U : In.uses()) {
+            if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+              dbgs() << "    In used by : " << *UI << "\n";
+          }
+          // Create temp load from result in case the result is used by any
+          // other instruction.
+          Value *LoadedResult = Builder.CreateLoad(
+              HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
+          LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");
+          In.replaceAllUsesWith(LoadedResult);
+        } else {
+          dbgs() << "    Unhandled index type for vgather\n";
+          return nullptr;
+        }
+      } else if (ElemWidth == 4) {
+        if (IndexWidth == 4) {
+          // v32i32
+          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+          Value *AdjustedIndex = HVC.createHvxIntrinsic(
+              Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
+          LLVM_DEBUG(dbgs()
+                     << "  Shifted word index: " << *AdjustedIndex << ")\n");
+          Gather = Builder.CreateIntrinsic(
+              Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
+              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+               AdjustedIndex},
+              nullptr);
+        } else {
+          LLVM_DEBUG(dbgs() << "    Unhandled index type for vgather\n");
+          return nullptr;
+        }
+      } else {
+        LLVM_DEBUG(dbgs() << "    Unhandled element type for vgather\n");
+        return nullptr;
+      }
+    } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
+      // This is half of the reg width, duplicate low in high
+      LLVM_DEBUG(dbgs() << "    Unhandled half of register size\n");
+      return nullptr;
+    } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
+      LLVM_DEBUG(dbgs() << "    Unhandle twice the register size\n");
+      return nullptr;
+    }
+    // Erase the original intrinsic and the store that consumes it.
+    // HVX will create a pseudo for the gather that is expanded to gather +
+    // store during packetization.
+    Dst->eraseFromParent();
+  } else if (Qual == HvxIdioms::LLVM_Scatter) {
+    // Gather feeds directly into scatter.
+    auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
+    assert(DstInpTy && "Cannot handle no vector type for llvm.scatter");
+    unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
+    unsigned DstElements = HVC.length(DstInpTy);
+    auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
+    assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
+    LLVM_DEBUG(dbgs() << "  Gather feeds into scatter\n  Values to scatter : "
+                      << *Dst->getOperand(0) << "\n");
+    LLVM_DEBUG(dbgs() << "  Dst type(" << *DstInpTy << ") elements("
+                      << DstElements << ") VecLen(" << DstInpSize << ") type("
+                      << *DstElemTy << ") Access alignment("
+                      << *Dst->getOperand(2) << ")\n");
+    // Address of source
+    auto *Src = getPointer(IndexLoad);
+    if (!Src)
+      return nullptr;
+    LLVM_DEBUG(dbgs() << "  Src            : " << *Src << "\n");
+
+    if (!isa<PointerType>(Src->getType())) {
+      LLVM_DEBUG(dbgs() << "    Source is not a pointer type...\n");
+      return nullptr;
+    }
+
+    Value *CastedSrc = Builder.CreateBitOrPointerCast(
+        Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+    LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n");
+
+    auto *DstLoad = locateAddressFromIntrinsic(Dst);
+    if (!DstLoad) {
+      LLVM_DEBUG(dbgs() << "  Unable to locate DstLoad\n");
+      return nullptr;
+    }
+    LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n");
+
+    Value *Ptr = getPointer(DstLoad);
+    if (!Ptr)
+      return nullptr;
+    LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n");
+    Value *CastIndex =
+        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
+    LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n");
+    // Shift all i16 left by 1 to convert halfword indexes into byte offsets.
+    auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+    Value *AdjustedIndex = HVC.createHvxIntrinsic(
+        Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+    LLVM_DEBUG(dbgs() << "  Shifted half index: " << *AdjustedIndex << ")\n");
+
+    return Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+        {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+         AdjustedIndex},
+        nullptr);
+  } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
+    // Gather feeds into a previously inserted pseudo intrinsic. These cannot
+    // be in the same packet, so we need to generate another pseudo that is
+    // expanded to .tmp + store:
+    //   V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
+    //                        ModRegs:$Mu, HvxVR:$Vv)
+    if (isa<AllocaInst>(IndexLoad)) {
+      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+      if (cstDataVector) {
+        // Our indexes are represented as a constant. We need them in a reg.
+        // This most likely will not work properly, since alloca gives us a
+        // DDR stack location. This will be fixed once we teach the compiler
+        // about VTCM.
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+        auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca);
+        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n");
+        Value *LoadedIndex = Builder.CreateLoad(
+            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca << "\n");
+
+        Value *CastedSrc = Builder.CreateBitOrPointerCast(
+            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");
+
+        Gather = Builder.CreateIntrinsic(
+            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+            {ResultAlloca, CastedSrc,
+             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+            nullptr);
+        Value *LoadedResult = Builder.CreateLoad(
+            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");
+        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n");
+        In.replaceAllUsesWith(LoadedResult);
+      }
+    } else {
+      // Address of source
+      auto *Src = getPointer(IndexLoad);
+      if (!Src)
+        return nullptr;
+      LLVM_DEBUG(dbgs() << "  Src      : " << *Src << "\n");
+
+      Value *CastedSrc = Builder.CreateBitOrPointerCast(
+          Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+      LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n");
+
+      auto *DstLoad = locateAddressFromIntrinsic(Dst);
+      if (!DstLoad)
+        return nullptr;
+      LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n");
+      auto *Ptr = getPointer(DstLoad);
+      if (!Ptr)
+        return nullptr;
+      LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n");
+
+      Gather = Builder.CreateIntrinsic(
+          Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
+          {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+           Indexes},
+          nullptr);
+    }
+    return Gather;
+  } else if (Qual == HvxIdioms::HEX_Scatter) {
+    // This is the case when the result of a gather is used as an argument to
+    // Intrinsic::hexagon_V6_vscattermh_128B, which most likely we just
+    // inserted ourselves. We have to create an alloca, store to it, and
+    // replace all uses with that.
+    AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+    Value *CastedSrc = Builder.CreateBitOrPointerCast(
+        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+    LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");
+    Value *CastIndex =
+        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+    LLVM_DEBUG(dbgs() << "  Cast index     : " << *CastIndex << ")\n");
+
+    Gather = Builder.CreateIntrinsic(
+        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+        {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+         CastIndex},
+        nullptr);
+    Value *LoadedResult = Builder.CreateLoad(
+        HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+    LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");
+    In.replaceAllUsesWith(LoadedResult);
+  } else if (Qual == HvxIdioms::HEX_Gather) {
+    // Gather feeds another gather that has already been replaced with
+    // hexagon_V6_vgathermh_128B.
+    if (isa<AllocaInst>(IndexLoad)) {
+      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+      if (cstDataVector) {
+        // Our indexes are represented as a constant. We need them in a reg.
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+
+        auto *StoreIndexes = Builder.CreateStore(cstDataVector, IndexesAlloca);
+        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n");
+        Value *LoadedIndex = Builder.CreateLoad(
+            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca
+                          << "\n  AddressSpace: "
+                          << ResultAlloca->getAddressSpace() << "\n";);
+
+        Value *CastedSrc = Builder.CreateBitOrPointerCast(
+            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n");
+
+        Gather = Builder.CreateIntrinsic(
+            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+            {ResultAlloca, CastedSrc,
+             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+            nullptr);
+        Value *LoadedResult = Builder.CreateLoad(
+            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n");
+        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n");
+        In.replaceAllUsesWith(LoadedResult);
+      }
+    }
+  } else if (Qual == HvxIdioms::LLVM_Gather) {
+    // Gather feeds into another gather.
+    errs() << " Unimplemented vgather-to-vgather sequence\n";
+    return nullptr;
+  } else
+    llvm_unreachable("Unhandled Qual enum");
+
+  return Gather;
+}
+
 auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
                                      const FxpOp &Op) const -> Value * {
   assert(Op.X.Val->getType() == Op.Y.Val->getType());
@@ -2138,6 +2994,26 @@ auto HvxIdioms::run() -> bool {
         It = StartOver ? B.rbegin()
                        : cast<Instruction>(New)->getReverseIterator();
         Changed = true;
+      } else if (matchGather(*It)) {
+        Value *New = processVGather(*It);
+        if (!New)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Gather : " << *New << "\n");
+        // Replace the original intrinsic with the new pseudo call.
+        It->eraseFromParent();
+        It = cast<Instruction>(New)->getReverseIterator();
+        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+        Changed = true;
+      } else if (matchScatter(*It)) {
+        Value *New = processVScatter(*It);
+        if (!New)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Scatter : " << *New << "\n");
+        // Replace the original intrinsic with the new pseudo call.
+        It->eraseFromParent();
+        It = cast<Instruction>(New)->getReverseIterator();
+        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+        Changed = true;
       }
     }
   }
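Judging by the tests that follow, the gathers being matched have a canonical
shape: a splat base pointer, a vector GEP of the indexes, and an all-true
mask. A minimal sketch of that input form, with illustrative names:

    %splat = shufflevector <64 x ptr> %ins, <64 x ptr> poison, <64 x i32> zeroinitializer
    %addrs = getelementptr inbounds i16, <64 x ptr> %splat, <64 x i32> %idx
    %v     = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %addrs,
                 i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)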
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
new file mode 100644
index 0000000000000..99b50629c39e2
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_scalarize_scatter.ll
@@ -0,0 +1,68 @@
+; Make sure we do not assert for the cases we do not handle.
+; RUN: llc -march=hexagon -mattr=+hvx,+hvx-length128b,+hvxv75,+v75,-long-calls < %s | FileCheck %s
+
+; Mainly, make sure we do not crash.
+; CHECK-NOT: scatter
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite)
+define dso_local void @foo(ptr noundef writeonly captures(none) %cptr, i32 noundef %T, i32 noundef %W) local_unnamed_addr #0 {
+entry:
+  %invariant.gep11 = getelementptr i8, ptr %cptr, i32 0
+  %invariant.gep13 = getelementptr i8, ptr %invariant.gep11, i32 0
+  %cmp.not15 = icmp ugt i32 8, %T
+  br i1 %cmp.not15, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp3.not8 = icmp ugt i32 8, %W
+  %conv.ripple.LS.instance = trunc i32 %W to i8
+  %conv.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <64 x i8> poison, i8 %conv.ripple.LS.instance, i64 0
+  %conv.ripple.LS.instance.ripple.bcast.splat = shufflevector <64 x i8> %conv.ripple.LS.instance.ripple.bcast.splatinsert, <64 x i8> poison, <64 x i32> zeroinitializer
+  br label %for.cond1.preheader
+
+for.cond.loopexit:                                ; preds = %for.body5, %for.cond1.preheader
+  %add = add i32 %add17, 8
+  %cmp.not = icmp ugt i32 %add, %T
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.cond.loopexit
+  %add17 = phi i32 [ 8, %for.cond1.preheader.lr.ph ], [ %add, %for.cond.loopexit ]
+  %t.016 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add17, %for.cond.loopexit ]
+  br i1 %cmp3.not8, label %for.cond.loopexit, label %for.body5.lr.ph
+
+for.body5.lr.ph:                                  ; preds = %for.cond1.preheader
+  %gep14 = getelementptr i8, ptr %invariant.gep13, i32 %t.016
+  br label %for.body5
+
+for.cond.cleanup:                                 ; preds = %for.cond.loopexit, %entry
+  ret void
+
+for.body5:                                        ; preds = %for.body5.lr.ph, %for.body5
+  %add210 = phi i32 [ 8, %for.body5.lr.ph ], [ %add2, %for.body5 ]
+  %w.09 = phi i32 [ 0, %for.body5.lr.ph ], [ %add210, %for.body5 ]
+  %gep = getelementptr i8, ptr %gep14, i32 %w.09
+  %gep.ripple.LS.instance = getelementptr i8, ptr %gep, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %conv.ripple.LS.instance.ripple.bcast.splat, <64 x ptr> %gep.ripple.LS.instance, i32 1, <64 x i1> splat (i1 true))
+  %add2 = add i32 %add210, 8
+  %cmp3.not = icmp ugt i32 %add2, %W
+  br i1 %cmp3.not, label %for.cond.loopexit, label %for.body5
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.ripple.block.setsize.i32(i32 immarg %0, i32 immarg %1, i32 %2) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.index.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare i32 @llvm.ripple.block.getsize.i32(i32 immarg %0, i32 immarg %1) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #3
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: write, inaccessiblemem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv75" "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
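The scatter in this test uses overlapping byte lanes, so it cannot be turned
into a single HVX vscatter; the CHECK-NOT above only pins down that no
vscatter is emitted. A scalarized lowering, which the file name suggests,
would be per-lane stores, roughly (a sketch, names illustrative):

    %p0 = extractelement <64 x ptr> %ptrs, i64 0
    %v0 = extractelement <64 x i8> %vals, i64 0
    store i8 %v0, ptr %p0, align 1
    ; ...and likewise for the remaining lanes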
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
new file mode 100644
index 0000000000000..83fd63ebc037c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_gather_32:
+; CHECK: vtmp.w = vgather
+; CHECK-LABEL: Ripple_gather_16:
+; CHECK: vtmp.h = vgather
+; CHECK-LABEL: Ripple_gather_8:
+; CHECK: vand
+; CHECK: vpacke
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <32 x ptr> %source.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+  %0 = load <32 x i32>, ptr %indexes, align 4
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i32, <32 x ptr> %source.ripple.bcast.splat, <32 x i32> %0
+  %1 = tail call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> %arrayidx2.ripple.vectorized, i32 4, <32 x i1> splat (i1 true), <32 x i32> poison)
+  store <32 x i32> %1, ptr %destination, align 4
+  ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <64 x ptr> %source.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %0 = load <64 x i16>, ptr %indexes, align 2
+  %idxprom.ripple.vectorized = zext <64 x i16> %0 to <64 x i32>
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i16, <64 x ptr> %source.ripple.bcast.splat, <64 x i32> %idxprom.ripple.vectorized
+  %1 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.vectorized, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
+  store <64 x i16> %1, ptr %destination, align 2
+  ret void
+}
+
+; Function Attrs: nofree noinline norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite)
+define dso_local void @Ripple_gather_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %source.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %source, i64 0
+  %source.ripple.bcast.splat = shufflevector <128 x ptr> %source.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+  %0 = load <128 x i8>, ptr %indexes, align 1
+  %idxprom.ripple.vectorized = zext <128 x i8> %0 to <128 x i32>
+  %arrayidx2.ripple.vectorized = getelementptr inbounds i8, <128 x ptr> %source.ripple.bcast.splat, <128 x i32> %idxprom.ripple.vectorized
+  %1 = tail call <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %arrayidx2.ripple.vectorized, i32 1, <128 x i1> splat (i1 true), <128 x i8> poison)
+  store <128 x i8> %1, ptr %destination, align 1
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i32>) #1
+declare <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr>, i32 immarg, <64 x i1>, <64 x i16>) #1
+declare <128 x i8> @llvm.masked.gather.v128i8.v128p0(<128 x ptr> %0, i32 immarg %1, <128 x i1> %2, <128 x i8> %3) #1
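The vand/vpacke checks for Ripple_gather_8 above reflect that HVX has no
byte-granularity gather: judging from the CHECK lines, the bytes are gathered
at halfword granularity and then packed back down. At the IR level that
corresponds to roughly the following (a sketch inferred from the CHECK lines,
not taken from the patch; names illustrative):

    %wide  = call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %addrs,
                 i32 1, <64 x i1> splat (i1 true), <64 x i16> poison)
    %bytes = trunc <64 x i16> %wide to <64 x i8>  ; the vand/vpacke step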
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
new file mode 100644
index 0000000000000..1bd79d7a46556
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vgather_SpVV.ll
@@ -0,0 +1,54 @@
+; Verify that we generate HVX vgather for the given input.
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+; CHECK-LABEL: SpVV_Ripple:
+; CHECK: vtmp.h = vgather(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local i32 @SpVV_Ripple(ptr nocapture noundef writeonly %scratchpad, ptr nocapture noundef readonly %Source_1, ptr nocapture noundef readonly %S_index, i32 noundef %nS, ptr nocapture noundef readonly %Source_2) local_unnamed_addr #1 {
+entry:
+  %Source_2.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %Source_2, i64 0
+  %Source_2.ripple.bcast.splat = shufflevector <64 x ptr> %Source_2.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %div16 = lshr i32 %nS, 6
+  %cmp6.not = icmp ult i32 %nS, 64
+  br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %lsr.iv17 = phi ptr [ %scevgep18, %for.body ], [ %S_index, %entry ]
+  %lsr.iv = phi ptr [ %scevgep, %for.body ], [ %Source_1, %entry ]
+  %result.08.ripple.vectorized = phi <64 x i32> [ %add8.ripple.vectorized, %for.body ], [ zeroinitializer, %entry ]
+  %_ripple_block_0.07 = phi i32 [ %add9, %for.body ], [ 0, %entry ]
+  %.ripple.LS.instance = load <64 x i16>, ptr %lsr.iv17, align 2
+  %idxprom.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance to <64 x i32>
+  %arrayidx2.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %Source_2.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+  %.ripple.LS.instance13 = tail call <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %arrayidx2.ripple.LS.instance, i32 2, <64 x i1> splat (i1 true), <64 x i16> poison)
+  store <64 x i16> %.ripple.LS.instance13, ptr %scratchpad, align 2
+  %.ripple.LS.instance15 = load <64 x i16>, ptr %lsr.iv, align 2
+  %conv.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance15 to <64 x i32>
+  %conv6.ripple.LS.instance = sext <64 x i16> %.ripple.LS.instance13 to <64 x i32>
+  %mul7.ripple.LS.instance = mul nsw <64 x i32> %conv.ripple.LS.instance, %conv6.ripple.LS.instance
+  %add8.ripple.vectorized = add <64 x i32> %mul7.ripple.LS.instance, %result.08.ripple.vectorized
+  %add9 = add nuw nsw i32 %_ripple_block_0.07, 1
+  %scevgep = getelementptr i8, ptr %lsr.iv, i32 128
+  %scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 128
+  %cmp = icmp ult i32 %add9, %div16
+  br i1 %cmp, label %for.body, label %for.end
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa.ripple.LS.instance = phi <64 x i32> [ zeroinitializer, %entry ], [ %add8.ripple.vectorized, %for.body ]
+  %rdx.shuf = shufflevector <64 x i32> %result.0.lcssa.ripple.LS.instance, <64 x i32> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx = add <64 x i32> %result.0.lcssa.ripple.LS.instance, %rdx.shuf
+  %rdx.shuf19 = shufflevector <64 x i32> %bin.rdx, <64 x i32> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx20 = add <64 x i32> %bin.rdx, %rdx.shuf19
+  %rdx.shuf21 = shufflevector <64 x i32> %bin.rdx20, <64 x i32> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx22 = add <64 x i32> %bin.rdx20, %rdx.shuf21
+  %rdx.shuf23 = shufflevector <64 x i32> %bin.rdx22, <64 x i32> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx24 = add <64 x i32> %bin.rdx22, %rdx.shuf23
+  %rdx.shuf25 = shufflevector <64 x i32> %bin.rdx24, <64 x i32> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx26 = add <64 x i32> %bin.rdx24, %rdx.shuf25
+  %rdx.shuf27 = shufflevector <64 x i32> %bin.rdx26, <64 x i32> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bin.rdx28 = add <64 x i32> %bin.rdx26, %rdx.shuf27
+  %0 = extractelement <64 x i32> %bin.rdx28, i32 0
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
new file mode 100644
index 0000000000000..85d299948b729
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/ripple_vscatter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv73,+v73,-long-calls -hexagon-allow-scatter-gather-hvx < %s | FileCheck %s
+
+; CHECK-LABEL: Ripple_scatter_8:
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK: if (q{{[0-9]+}}) vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h
+; CHECK-LABEL: Ripple_scatter_16:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.h).h = v{{[0-9]+}}
+; CHECK-LABEL: Ripple_scatter_32:
+; CHECK: vscatter(r{{[0-9]+}},m0,v{{[0-9]+}}.w).w = v{{[0-9]+}}
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @Ripple_scatter_8(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <128 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <128 x ptr> %destination.ripple.bcast.splatinsert, <128 x ptr> poison, <128 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <128 x i8>, ptr %source, align 1
+  %.ripple.LS.instance = load <128 x i8>, ptr %indexes, align 1
+  %idxprom.ripple.LS.instance = zext <128 x i8> %.ripple.LS.instance to <128 x i32>
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i8, <128 x ptr> %destination.ripple.bcast.splat, <128 x i32> %idxprom.ripple.LS.instance
+  %cst_ptr_to_i32 = ptrtoint ptr %destination to i32
+  tail call void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %.ripple.LS.instance11, <128 x ptr> %arrayidx3.ripple.LS.instance, i32 1, <128 x i1> splat (i1 true))
+  ret void
+}
+
+define dso_local void @Ripple_scatter_16(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <64 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <64 x ptr> %destination.ripple.bcast.splatinsert, <64 x ptr> poison, <64 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <64 x i16>, ptr %source, align 2
+  %.ripple.LS.instance = load <64 x i16>, ptr %indexes, align 2
+  %idxprom.ripple.LS.instance = zext <64 x i16> %.ripple.LS.instance to <64 x i32>
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i16, <64 x ptr> %destination.ripple.bcast.splat, <64 x i32> %idxprom.ripple.LS.instance
+  tail call void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %.ripple.LS.instance11, <64 x ptr> %arrayidx3.ripple.LS.instance, i32 2, <64 x i1> splat (i1 true))
+  ret void
+}
+
+define dso_local void @Ripple_scatter_32(ptr nocapture noundef writeonly %destination, ptr nocapture noundef readonly %source, ptr nocapture noundef readonly %indexes) local_unnamed_addr #0 {
+entry:
+  %destination.ripple.bcast.splatinsert = insertelement <32 x ptr> poison, ptr %destination, i64 0
+  %destination.ripple.bcast.splat = shufflevector <32 x ptr> %destination.ripple.bcast.splatinsert, <32 x ptr> poison, <32 x i32> zeroinitializer
+  %.ripple.LS.instance11 = load <32 x i32>, ptr %source, align 4
+  %.ripple.LS.instance = load <32 x i32>, ptr %indexes, align 4
+  %arrayidx3.ripple.LS.instance = getelementptr inbounds i32, <32 x ptr> %destination.ripple.bcast.splat, <32 x i32> %.ripple.LS.instance
+  tail call void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %.ripple.LS.instance11, <32 x ptr> %arrayidx3.ripple.LS.instance, i32 4, <32 x i1> splat (i1 true))
+  ret void
+}
+
+declare void @llvm.masked.scatter.v128i8.v128p0(<128 x i8> %0, <128 x ptr> %1, i32 immarg %2, <128 x i1> %3) #2
+declare void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %0, <64 x ptr> %1, i32 immarg %2, <64 x i1> %3) #2
+declare void @llvm.masked.scatter.v32i32.v32p0(<32 x i32> %0, <32 x ptr> %1, i32 immarg %2, <32 x i1> %3) #2
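At the intrinsic level the halfword scatter corresponds to
llvm.hexagon.V6.vscattermh.128B. A minimal sketch, assuming the
(base, region size, offsets, values) operand order of the HVX scatter
intrinsics, with 64 halfwords carried in <32 x i32> registers:

    %base = ptrtoint ptr %destination to i32
    call void @llvm.hexagon.V6.vscattermh.128B(i32 %base, i32 %region,
              <32 x i32> %offsets, <32 x i32> %values)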
diff --git a/llvm/test/CodeGen/Hexagon/masked_gather.ll b/llvm/test/CodeGen/Hexagon/masked_gather.ll
new file mode 100644
index 0000000000000..1c25ca1f8bdbc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/masked_gather.ll
@@ -0,0 +1,58 @@
+; This produced a masked gather that we do not yet handle.
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -mattr=+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-packets -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Original C++
+; clang -c -Os -mhvx -mhvx-ieee-fp -fvectorize -mno-packets -fno-strict-aliasing -mv68
+; typedef struct poptContext_s * poptContext;
+; typedef struct { unsigned int bits[1]; } pbm_set;
+; struct poptContext_s { pbm_set * arg_strip; };
+;
+; int poptStrippedArgv(poptContext con, int argc, char ** argv) {
+;   int numargs = argc;
+;   for (int i = 1; i < argc; i++) {
+;     if (((((con->arg_strip)->bits)[((i) / (8 * sizeof (unsigned int)))] & ((unsigned int) 1 << ((i) % (8 * sizeof (unsigned int))))) != 0))
+;       numargs--;
+;   }
+;   return numargs;
+; }
+
+; CHECK-NOT: masked_gather
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown-unknown-elf"
+
+; Function Attrs: nofree norecurse nosync nounwind optsize memory(read, inaccessiblemem: none)
+define dso_local i32 @poptStrippedArgv(ptr noundef readonly captures(none) %con, i32 noundef %argc, ptr noundef readnone captures(none) %argv) local_unnamed_addr {
+entry:
+  %cmp8 = icmp sgt i32 %argc, 1
+  br i1 %cmp8, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load ptr, ptr %con, align 4
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %numargs.0.lcssa = phi i32 [ %argc, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %numargs.0.lcssa
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.010 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.body ]
+  %numargs.09 = phi i32 [ %argc, %for.body.lr.ph ], [ %spec.select, %for.body ]
+  %div7 = lshr i32 %i.010, 5
+  %arrayidx = getelementptr inbounds nuw [1 x i32], ptr %0, i32 0, i32 %div7
+  %1 = load i32, ptr %arrayidx, align 4
+  %rem = and i32 %i.010, 31
+  %shl = shl nuw i32 1, %rem
+  %and = and i32 %1, %shl
+  %cmp1.not = icmp ne i32 %and, 0
+  %dec = sext i1 %cmp1.not to i32
+  %spec.select = add nsw i32 %numargs.09, %dec
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %argc
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/vector-gather.ll b/llvm/test/CodeGen/Hexagon/vector-gather.ll
new file mode 100644
index 0000000000000..5700380508a57
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vector-gather.ll
@@ -0,0 +1,27 @@
+; REQUIRES: hexagon-registered-target
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+target triple = "hexagon"
+
+ at VTCM_SCATTER16_ADDRESS = dso_local global i32 0, align 4
+ at region_len = dso_local global i32 16383, align 4
+
+; CHECK: [[ADR:r[0-9]+]] = memw(gp+#VTCM_SCATTER16_ADDRESS)
+; CHECK: vtmp.h = vgather([[ADR]],m0,v0.h).h
+; CHECK: vmem(r0+#0) = vtmp.new
+
+define dso_local void @vector_gather_16(ptr noundef %vgather, <32 x i32> noundef %offsets) #0 {
+entry:
+  %vgather.addr = alloca ptr, align 4
+  %offsets.addr = alloca <32 x i32>, align 128
+  store ptr %vgather, ptr %vgather.addr, align 4
+  store <32 x i32> %offsets, ptr %offsets.addr, align 128
+  %0 = load ptr, ptr %vgather.addr, align 4
+  %1 = load i32, ptr @VTCM_SCATTER16_ADDRESS, align 4
+  %2 = load i32, ptr @region_len, align 4
+  %3 = load <32 x i32>, ptr %offsets.addr, align 128
+  call void @llvm.hexagon.V6.vgathermh.128B(ptr %0, i32 %1, i32 %2, <32 x i32> %3)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vgathermh.128B(ptr, i32, i32, <32 x i32>)



More information about the llvm-commits mailing list