[llvm] 6a6ac3b - [Hexagon] Support BUILD_VECTOR of floating point HVX vectors

Tue Dec 28 15:01:29 PST 2021

Author: Krzysztof Parzyszek
Date: 2021-12-28T14:59:08-08:00
New Revision: 6a6ac3b36fcdb44a5096f2ddab952a1281eb144e

URL: https://github.com/llvm/llvm-project/commit/6a6ac3b36fcdb44a5096f2ddab952a1281eb144e
DIFF: https://github.com/llvm/llvm-project/commit/6a6ac3b36fcdb44a5096f2ddab952a1281eb144e.diff

LOG: [Hexagon] Support BUILD_VECTOR of floating point HVX vectors

Co-authored-by: Anirudh Sundar Subramaniam <quic_sanirudh at quicinc.com>
Co-authored-by: Ankit Aggarwal <aankit at quicinc.com>

Added: 
    llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll

Modified: 
    llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
    llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
    llvm/lib/Target/Hexagon/HexagonPatterns.td
    llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
    llvm/lib/Target/Hexagon/HexagonRegisterInfo.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 88effed9f0767..90dda37a886ab 100644

--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2720,7 +2720,6 @@ SDValue
 HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
       const {
   if (Ty.isVector()) {
-    assert(Ty.isInteger() && "Only integer vectors are supported here");
     unsigned W = Ty.getSizeInBits();
     if (W <= 64)
       return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W)));

diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index f7237f496aee3..e189b0b49e345 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -55,6 +55,11 @@ HexagonTargetLowering::initializeHVXLowering() {
     addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
     addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
     addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
+    if (Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) {
+      addRegisterClass(MVT::v32f32, &Hexagon::HvxVRRegClass);
+      addRegisterClass(MVT::v64f16, &Hexagon::HvxVRRegClass);
+      addRegisterClass(MVT::v64f32, &Hexagon::HvxWRRegClass);
+    }
   }
 
   // Set up operation actions.
@@ -83,6 +88,21 @@ HexagonTargetLowering::initializeHVXLowering() {
   setOperationAction(ISD::VECTOR_SHUFFLE,     ByteW,      Legal);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
+  if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() &&
+      Subtarget.useHVXFloatingPoint()) {
+    // Handle ISD::BUILD_VECTOR for v32f32 in a custom way to generate vsplat
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v32f32, Custom);
+
+    // BUILD_VECTOR with f16 operands cannot be promoted without
+    // promoting the result, so lower the node to vsplat or constant pool
+    setOperationAction(ISD::BUILD_VECTOR,      MVT::f16,    Custom);
+
+    // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+    // independent) handling of it would convert it to a load, which is
+    // not always the optimal choice.
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v64f32, Custom);
+  }
+
   for (MVT T : LegalV) {
     setIndexedLoadAction(ISD::POST_INC,  T, Legal);
     setIndexedStoreAction(ISD::POST_INC, T, Legal);
@@ -497,7 +517,9 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
   assert(ElemSize*VecLen == HwLen);
   SmallVector<SDValue,32> Words;
 
-  if (VecTy.getVectorElementType() != MVT::i32) {
+  if (VecTy.getVectorElementType() != MVT::i32 &&
+      !(Subtarget.useHVXFloatingPoint() &&
+      VecTy.getVectorElementType() == MVT::f32)) {
     assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");
     unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;
     MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord);
@@ -506,22 +528,31 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
       Words.push_back(DAG.getBitcast(MVT::i32, W));
     }
   } else {
-    Words.assign(Values.begin(), Values.end());
+    for (SDValue V : Values)
+      Words.push_back(DAG.getBitcast(MVT::i32, V));
   }
+  auto isSplat = [] (ArrayRef<SDValue> Values, SDValue &SplatV) {
+    unsigned NumValues = Values.size();
+    assert(NumValues > 0);
+    bool IsUndef = true;
+    for (unsigned i = 0; i != NumValues; ++i) {
+      if (Values[i].isUndef())
+        continue;
+      IsUndef = false;
+      if (!SplatV.getNode())
+        SplatV = Values[i];
+      else if (SplatV != Values[i])
+        return false;
+    }
+    if (IsUndef)
+      SplatV = Values[0];
+    return true;
+  };
 
   unsigned NumWords = Words.size();
-  bool IsSplat = true, IsUndef = true;
   SDValue SplatV;
-  for (unsigned i = 0; i != NumWords && IsSplat; ++i) {
-    if (isUndef(Words[i]))
-      continue;
-    IsUndef = false;
-    if (!SplatV.getNode())
-      SplatV = Words[i];
-    else if (SplatV != Words[i])
-      IsSplat = false;
-  }
-  if (IsUndef)
+  bool IsSplat = isSplat(Words, SplatV);
+  if (IsSplat && isUndef(SplatV))
     return DAG.getUNDEF(VecTy);
   if (IsSplat) {
     assert(SplatV.getNode());
@@ -634,8 +665,15 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
 
   HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy,
                        {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)});
-  SDValue DstV = DAG.getNode(ISD::OR, dl, VecTy, {HalfV0, HalfV1});
-  return DstV;
+
+  SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0);
+  SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1);
+
+  SDValue DstV = DAG.getNode(ISD::OR, dl, ty(T0), {T0, T1});
+
+  SDValue OutV =
+      DAG.getBitcast(tyVector(ty(DstV), VecTy.getVectorElementType()), DstV);
+  return OutV;
 }
 
 SDValue
@@ -1237,6 +1275,19 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
   if (VecTy.getVectorElementType() == MVT::i1)
     return buildHvxVectorPred(Ops, dl, VecTy, DAG);
 
+  // For a BUILD_VECTOR with MVT::f16 elements: since MVT::f16 is not a
+  // legal type, bitcast each operand to i16, build the vector with i16
+  // elements, and bitcast the result back to the f16 vector type.
+  if (VecTy.getVectorElementType() == MVT::f16) {
+    SmallVector<SDValue,64> NewOps;
+    for (unsigned i = 0; i != Size; i++)
+      NewOps.push_back(DAG.getBitcast(MVT::i16, Ops[i]));
+
+    SDValue T0 = DAG.getNode(ISD::BUILD_VECTOR, dl,
+        tyVector(VecTy, MVT::i16), NewOps);
+    return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0);
+  }
+
   if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
     ArrayRef<SDValue> A(Ops);
     MVT SingleTy = typeSplit(VecTy).first;

diff  --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index cad5ca8ab92ec..4ba6d4740e126 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -87,18 +87,6 @@ def V8I8:   PatLeaf<(v8i8    DoubleRegs:$R)>;
 def V4I16:  PatLeaf<(v4i16   DoubleRegs:$R)>;
 def V2I32:  PatLeaf<(v2i32   DoubleRegs:$R)>;
 
-def HQ8:    PatLeaf<(VecQ8   HvxQR:$R)>;
-def HQ16:   PatLeaf<(VecQ16  HvxQR:$R)>;
-def HQ32:   PatLeaf<(VecQ32  HvxQR:$R)>;
-
-def HVI8:   PatLeaf<(VecI8   HvxVR:$R)>;
-def HVI16:  PatLeaf<(VecI16  HvxVR:$R)>;
-def HVI32:  PatLeaf<(VecI32  HvxVR:$R)>;
-
-def HWI8:   PatLeaf<(VecPI8  HvxWR:$R)>;
-def HWI16:  PatLeaf<(VecPI16 HvxWR:$R)>;
-def HWI32:  PatLeaf<(VecPI32 HvxWR:$R)>;
-
 def SDTVecLeaf:
   SDTypeProfile<1, 0, [SDTCisVec<0>]>;
 def SDTVecVecIntOp:

diff  --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index a22a3f8ec0caa..15fa659d26aba 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -6,6 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 
+def HQ8:    PatLeaf<(VecQ8   HvxQR:$R)>;
+def HQ16:   PatLeaf<(VecQ16  HvxQR:$R)>;
+def HQ32:   PatLeaf<(VecQ32  HvxQR:$R)>;
+
+def HVI8:   PatLeaf<(VecI8   HvxVR:$R)>;
+def HVI16:  PatLeaf<(VecI16  HvxVR:$R)>;
+def HVI32:  PatLeaf<(VecI32  HvxVR:$R)>;
+def HVF16:  PatLeaf<(VecF16  HvxVR:$R)>;
+def HVF32:  PatLeaf<(VecF32  HvxVR:$R)>;
+
+def HWI8:   PatLeaf<(VecPI8  HvxWR:$R)>;
+def HWI16:  PatLeaf<(VecPI16 HvxWR:$R)>;
+def HWI32:  PatLeaf<(VecPI32 HvxWR:$R)>;
+def HWF16:  PatLeaf<(VecPF16 HvxWR:$R)>;
+def HWF32:  PatLeaf<(VecPF32 HvxWR:$R)>;
 
 def SDTVecUnaryOp:
   SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
@@ -211,6 +226,24 @@ let Predicates = [UseHVX] in {
   defm: NopCast_pat<VecPI16, VecPI32, HvxWR>;
 }
 
+let Predicates = [UseHVX, UseHVXFloatingPoint] in {
+  defm: NopCast_pat<VecI8,   VecF16,  HvxVR>;
+  defm: NopCast_pat<VecI8,   VecF32,  HvxVR>;
+  defm: NopCast_pat<VecI16,  VecF16,  HvxVR>;
+  defm: NopCast_pat<VecI16,  VecF32,  HvxVR>;
+  defm: NopCast_pat<VecI32,  VecF16,  HvxVR>;
+  defm: NopCast_pat<VecI32,  VecF32,  HvxVR>;
+  defm: NopCast_pat<VecF16,  VecF32,  HvxVR>;
+
+  defm: NopCast_pat<VecPI8,  VecPF16, HvxWR>;
+  defm: NopCast_pat<VecPI8,  VecPF32, HvxWR>;
+  defm: NopCast_pat<VecPI16, VecPF16, HvxWR>;
+  defm: NopCast_pat<VecPI16, VecPF32, HvxWR>;
+  defm: NopCast_pat<VecPI32, VecPF16, HvxWR>;
+  defm: NopCast_pat<VecPI32, VecPF32, HvxWR>;
+  defm: NopCast_pat<VecPF16, VecPF32, HvxWR>;
+}
+
 let Predicates = [UseHVX] in {
   let AddedComplexity = 100 in {
     // These should be preferred over a vsplat of 0.
@@ -251,6 +284,13 @@ let Predicates = [UseHVX] in {
            (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
 }
 
+let Predicates = [UseHVXFloatingPoint] in {
+  def: Pat<(HexagonVINSERTW0 HVF16:$Vu, I32:$Rt),
+           (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+  def: Pat<(HexagonVINSERTW0 HVF32:$Vu, I32:$Rt),
+           (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+}
+
 // Splats for HvxV60
 def V60splatib: OutPatFrag<(ops node:$V),  (V6_lvsplatw (ToI32 (SplatB $V)))>;
 def V60splatih: OutPatFrag<(ops node:$V),  (V6_lvsplatw (ToI32 (SplatH $V)))>;

diff  --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index 8b7138d3c809d..4c387c8ba638b 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -479,6 +479,10 @@ def VecI16:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v32i16, v64i16,  v32i16]>;
 def VecI32:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v16i32, v32i32,  v16i32]>;
+def VecF16:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
+                               [v32f16, v64f16,  v32f16]>;
+def VecF32:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
+                               [v16f32, v32f32,  v16f32]>;
 
 def VecPI8:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v128i8, v256i8,  v128i8]>;
@@ -486,6 +490,10 @@ def VecPI16: ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v64i16, v128i16, v64i16]>;
 def VecPI32: ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v32i32, v64i32,  v32i32]>;
+def VecPF16: ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
+                               [v64f16, v128f16, v64f16]>;
+def VecPF32: ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
+                               [v32f32, v64f32,  v32f32]>;
 
 def VecQ8:   ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
                                [v64i1,  v128i1,  v64i1]>;
@@ -496,13 +504,13 @@ def VecQ32:  ValueTypeByHwMode<[Hvx64,  Hvx128,  DefaultMode],
 
 // HVX register classes
 
-def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512,
+def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecF32], 512,
   (add (sequence "V%u", 0, 31), VTMP)> {
   let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
     [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>;
 }
 
-def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024,
+def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, VecPF32], 1024,
   (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> {
   let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
     [RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>;

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll
new file mode 100644
index 0000000000000..2eba9e2db446c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-float-type.ll
@@ -0,0 +1,504 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this code compiles and that vinsert is generated.
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+; CHECK-LABEL: f0:
+; CHECK: vinsert
+define <32 x float> @f0(i32* %a0, float* %a1) #0 {
+b0:
+  %v0 = getelementptr i32, i32* %a0, i32 0
+  %v1 = load i32, i32* %v0, align 4
+  %v2 = getelementptr float, float* %a1, i32 %v1
+  %v3 = load float, float* %v2, align 4
+  %v4 = insertelement <32 x float> undef, float %v3, i32 0
+  %v5 = getelementptr i32, i32* %a0, i32 1
+  %v6 = load i32, i32* %v5, align 4
+  %v7 = getelementptr float, float* %a1, i32 %v6
+  %v8 = load float, float* %v7, align 4
+  %v9 = insertelement <32 x float> %v4, float %v8, i32 1
+  %v10 = getelementptr i32, i32* %a0, i32 2
+  %v11 = load i32, i32* %v10, align 4
+  %v12 = getelementptr float, float* %a1, i32 %v11
+  %v13 = load float, float* %v12, align 4
+  %v14 = insertelement <32 x float> %v9, float %v13, i32 2
+  %v15 = getelementptr i32, i32* %a0, i32 3
+  %v16 = load i32, i32* %v15, align 4
+  %v17 = getelementptr float, float* %a1, i32 %v16
+  %v18 = load float, float* %v17, align 4
+  %v19 = insertelement <32 x float> %v14, float %v18, i32 3
+  %v20 = getelementptr i32, i32* %a0, i32 4
+  %v21 = load i32, i32* %v20, align 4
+  %v22 = getelementptr float, float* %a1, i32 %v21
+  %v23 = load float, float* %v22, align 4
+  %v24 = insertelement <32 x float> %v19, float %v23, i32 4
+  %v25 = getelementptr i32, i32* %a0, i32 5
+  %v26 = load i32, i32* %v25, align 4
+  %v27 = getelementptr float, float* %a1, i32 %v26
+  %v28 = load float, float* %v27, align 4
+  %v29 = insertelement <32 x float> %v24, float %v28, i32 5
+  %v30 = getelementptr i32, i32* %a0, i32 6
+  %v31 = load i32, i32* %v30, align 4
+  %v32 = getelementptr float, float* %a1, i32 %v31
+  %v33 = load float, float* %v32, align 4
+  %v34 = insertelement <32 x float> %v29, float %v33, i32 6
+  %v35 = getelementptr i32, i32* %a0, i32 7
+  %v36 = load i32, i32* %v35, align 4
+  %v37 = getelementptr float, float* %a1, i32 %v36
+  %v38 = load float, float* %v37, align 4
+  %v39 = insertelement <32 x float> %v34, float %v38, i32 7
+  %v40 = getelementptr i32, i32* %a0, i32 8
+  %v41 = load i32, i32* %v40, align 4
+  %v42 = getelementptr float, float* %a1, i32 %v41
+  %v43 = load float, float* %v42, align 4
+  %v44 = insertelement <32 x float> %v39, float %v43, i32 8
+  %v45 = getelementptr i32, i32* %a0, i32 9
+  %v46 = load i32, i32* %v45, align 4
+  %v47 = getelementptr float, float* %a1, i32 %v46
+  %v48 = load float, float* %v47, align 4
+  %v49 = insertelement <32 x float> %v44, float %v48, i32 9
+  %v50 = getelementptr i32, i32* %a0, i32 10
+  %v51 = load i32, i32* %v50, align 4
+  %v52 = getelementptr float, float* %a1, i32 %v51
+  %v53 = load float, float* %v52, align 4
+  %v54 = insertelement <32 x float> %v49, float %v53, i32 10
+  %v55 = getelementptr i32, i32* %a0, i32 11
+  %v56 = load i32, i32* %v55, align 4
+  %v57 = getelementptr float, float* %a1, i32 %v56
+  %v58 = load float, float* %v57, align 4
+  %v59 = insertelement <32 x float> %v54, float %v58, i32 11
+  %v60 = getelementptr i32, i32* %a0, i32 12
+  %v61 = load i32, i32* %v60, align 4
+  %v62 = getelementptr float, float* %a1, i32 %v61
+  %v63 = load float, float* %v62, align 4
+  %v64 = insertelement <32 x float> %v59, float %v63, i32 12
+  %v65 = getelementptr i32, i32* %a0, i32 13
+  %v66 = load i32, i32* %v65, align 4
+  %v67 = getelementptr float, float* %a1, i32 %v66
+  %v68 = load float, float* %v67, align 4
+  %v69 = insertelement <32 x float> %v64, float %v68, i32 13
+  %v70 = getelementptr i32, i32* %a0, i32 14
+  %v71 = load i32, i32* %v70, align 4
+  %v72 = getelementptr float, float* %a1, i32 %v71
+  %v73 = load float, float* %v72, align 4
+  %v74 = insertelement <32 x float> %v69, float %v73, i32 14
+  %v75 = getelementptr i32, i32* %a0, i32 15
+  %v76 = load i32, i32* %v75, align 4
+  %v77 = getelementptr float, float* %a1, i32 %v76
+  %v78 = load float, float* %v77, align 4
+  %v79 = insertelement <32 x float> %v74, float %v78, i32 15
+  %v80 = getelementptr i32, i32* %a0, i32 16
+  %v81 = load i32, i32* %v80, align 4
+  %v82 = getelementptr float, float* %a1, i32 %v81
+  %v83 = load float, float* %v82, align 4
+  %v84 = insertelement <32 x float> %v79, float %v83, i32 16
+  %v85 = getelementptr i32, i32* %a0, i32 17
+  %v86 = load i32, i32* %v85, align 4
+  %v87 = getelementptr float, float* %a1, i32 %v86
+  %v88 = load float, float* %v87, align 4
+  %v89 = insertelement <32 x float> %v84, float %v88, i32 17
+  %v90 = getelementptr i32, i32* %a0, i32 18
+  %v91 = load i32, i32* %v90, align 4
+  %v92 = getelementptr float, float* %a1, i32 %v91
+  %v93 = load float, float* %v92, align 4
+  %v94 = insertelement <32 x float> %v89, float %v93, i32 18
+  %v95 = getelementptr i32, i32* %a0, i32 19
+  %v96 = load i32, i32* %v95, align 4
+  %v97 = getelementptr float, float* %a1, i32 %v96
+  %v98 = load float, float* %v97, align 4
+  %v99 = insertelement <32 x float> %v94, float %v98, i32 19
+  %v100 = getelementptr i32, i32* %a0, i32 20
+  %v101 = load i32, i32* %v100, align 4
+  %v102 = getelementptr float, float* %a1, i32 %v101
+  %v103 = load float, float* %v102, align 4
+  %v104 = insertelement <32 x float> %v99, float %v103, i32 20
+  %v105 = getelementptr i32, i32* %a0, i32 21
+  %v106 = load i32, i32* %v105, align 4
+  %v107 = getelementptr float, float* %a1, i32 %v106
+  %v108 = load float, float* %v107, align 4
+  %v109 = insertelement <32 x float> %v104, float %v108, i32 21
+  %v110 = getelementptr i32, i32* %a0, i32 22
+  %v111 = load i32, i32* %v110, align 4
+  %v112 = getelementptr float, float* %a1, i32 %v111
+  %v113 = load float, float* %v112, align 4
+  %v114 = insertelement <32 x float> %v109, float %v113, i32 22
+  %v115 = getelementptr i32, i32* %a0, i32 23
+  %v116 = load i32, i32* %v115, align 4
+  %v117 = getelementptr float, float* %a1, i32 %v116
+  %v118 = load float, float* %v117, align 4
+  %v119 = insertelement <32 x float> %v114, float %v118, i32 23
+  %v120 = getelementptr i32, i32* %a0, i32 24
+  %v121 = load i32, i32* %v120, align 4
+  %v122 = getelementptr float, float* %a1, i32 %v121
+  %v123 = load float, float* %v122, align 4
+  %v124 = insertelement <32 x float> %v119, float %v123, i32 24
+  %v125 = getelementptr i32, i32* %a0, i32 25
+  %v126 = load i32, i32* %v125, align 4
+  %v127 = getelementptr float, float* %a1, i32 %v126
+  %v128 = load float, float* %v127, align 4
+  %v129 = insertelement <32 x float> %v124, float %v128, i32 25
+  %v130 = getelementptr i32, i32* %a0, i32 26
+  %v131 = load i32, i32* %v130, align 4
+  %v132 = getelementptr float, float* %a1, i32 %v131
+  %v133 = load float, float* %v132, align 4
+  %v134 = insertelement <32 x float> %v129, float %v133, i32 26
+  %v135 = getelementptr i32, i32* %a0, i32 27
+  %v136 = load i32, i32* %v135, align 4
+  %v137 = getelementptr float, float* %a1, i32 %v136
+  %v138 = load float, float* %v137, align 4
+  %v139 = insertelement <32 x float> %v134, float %v138, i32 27
+  %v140 = getelementptr i32, i32* %a0, i32 28
+  %v141 = load i32, i32* %v140, align 4
+  %v142 = getelementptr float, float* %a1, i32 %v141
+  %v143 = load float, float* %v142, align 4
+  %v144 = insertelement <32 x float> %v139, float %v143, i32 28
+  %v145 = getelementptr i32, i32* %a0, i32 29
+  %v146 = load i32, i32* %v145, align 4
+  %v147 = getelementptr float, float* %a1, i32 %v146
+  %v148 = load float, float* %v147, align 4
+  %v149 = insertelement <32 x float> %v144, float %v148, i32 29
+  %v150 = getelementptr i32, i32* %a0, i32 30
+  %v151 = load i32, i32* %v150, align 4
+  %v152 = getelementptr float, float* %a1, i32 %v151
+  %v153 = load float, float* %v152, align 4
+  %v154 = insertelement <32 x float> %v149, float %v153, i32 30
+  %v155 = getelementptr i32, i32* %a0, i32 31
+  %v156 = load i32, i32* %v155, align 4
+  %v157 = getelementptr float, float* %a1, i32 %v156
+  %v158 = load float, float* %v157, align 4
+  %v159 = insertelement <32 x float> %v154, float %v158, i32 31
+  ret <32 x float> %v159
+}
+
+; Function Attrs: nounwind
+; CHECK-LABEL: f1:
+; CHECK: vinsert
+define <64 x half> @f1(i32* %a0, half* %a1) #0 {
+b0:
+  %v0 = getelementptr i32, i32* %a0, i32 0
+  %v1 = load i32, i32* %v0, align 4
+  %v2 = getelementptr half, half* %a1, i32 %v1
+  %v3 = load half, half* %v2, align 4
+  %v4 = insertelement <64 x half> undef, half %v3, i32 0
+  %v5 = getelementptr i32, i32* %a0, i32 1
+  %v6 = load i32, i32* %v5, align 4
+  %v7 = getelementptr half, half* %a1, i32 %v6
+  %v8 = load half, half* %v7, align 4
+  %v9 = insertelement <64 x half> %v4, half %v8, i32 1
+  %v10 = getelementptr i32, i32* %a0, i32 2
+  %v11 = load i32, i32* %v10, align 4
+  %v12 = getelementptr half, half* %a1, i32 %v11
+  %v13 = load half, half* %v12, align 4
+  %v14 = insertelement <64 x half> %v9, half %v13, i32 2
+  %v15 = getelementptr i32, i32* %a0, i32 3
+  %v16 = load i32, i32* %v15, align 4
+  %v17 = getelementptr half, half* %a1, i32 %v16
+  %v18 = load half, half* %v17, align 4
+  %v19 = insertelement <64 x half> %v14, half %v18, i32 3
+  %v20 = getelementptr i32, i32* %a0, i32 4
+  %v21 = load i32, i32* %v20, align 4
+  %v22 = getelementptr half, half* %a1, i32 %v21
+  %v23 = load half, half* %v22, align 4
+  %v24 = insertelement <64 x half> %v19, half %v23, i32 4
+  %v25 = getelementptr i32, i32* %a0, i32 5
+  %v26 = load i32, i32* %v25, align 4
+  %v27 = getelementptr half, half* %a1, i32 %v26
+  %v28 = load half, half* %v27, align 4
+  %v29 = insertelement <64 x half> %v24, half %v28, i32 5
+  %v30 = getelementptr i32, i32* %a0, i32 6
+  %v31 = load i32, i32* %v30, align 4
+  %v32 = getelementptr half, half* %a1, i32 %v31
+  %v33 = load half, half* %v32, align 4
+  %v34 = insertelement <64 x half> %v29, half %v33, i32 6
+  %v35 = getelementptr i32, i32* %a0, i32 7
+  %v36 = load i32, i32* %v35, align 4
+  %v37 = getelementptr half, half* %a1, i32 %v36
+  %v38 = load half, half* %v37, align 4
+  %v39 = insertelement <64 x half> %v34, half %v38, i32 7
+  %v40 = getelementptr i32, i32* %a0, i32 8
+  %v41 = load i32, i32* %v40, align 4
+  %v42 = getelementptr half, half* %a1, i32 %v41
+  %v43 = load half, half* %v42, align 4
+  %v44 = insertelement <64 x half> %v39, half %v43, i32 8
+  %v45 = getelementptr i32, i32* %a0, i32 9
+  %v46 = load i32, i32* %v45, align 4
+  %v47 = getelementptr half, half* %a1, i32 %v46
+  %v48 = load half, half* %v47, align 4
+  %v49 = insertelement <64 x half> %v44, half %v48, i32 9
+  %v50 = getelementptr i32, i32* %a0, i32 10
+  %v51 = load i32, i32* %v50, align 4
+  %v52 = getelementptr half, half* %a1, i32 %v51
+  %v53 = load half, half* %v52, align 4
+  %v54 = insertelement <64 x half> %v49, half %v53, i32 10
+  %v55 = getelementptr i32, i32* %a0, i32 11
+  %v56 = load i32, i32* %v55, align 4
+  %v57 = getelementptr half, half* %a1, i32 %v56
+  %v58 = load half, half* %v57, align 4
+  %v59 = insertelement <64 x half> %v54, half %v58, i32 11
+  %v60 = getelementptr i32, i32* %a0, i32 12
+  %v61 = load i32, i32* %v60, align 4
+  %v62 = getelementptr half, half* %a1, i32 %v61
+  %v63 = load half, half* %v62, align 4
+  %v64 = insertelement <64 x half> %v59, half %v63, i32 12
+  %v65 = getelementptr i32, i32* %a0, i32 13
+  %v66 = load i32, i32* %v65, align 4
+  %v67 = getelementptr half, half* %a1, i32 %v66
+  %v68 = load half, half* %v67, align 4
+  %v69 = insertelement <64 x half> %v64, half %v68, i32 13
+  %v70 = getelementptr i32, i32* %a0, i32 14
+  %v71 = load i32, i32* %v70, align 4
+  %v72 = getelementptr half, half* %a1, i32 %v71
+  %v73 = load half, half* %v72, align 4
+  %v74 = insertelement <64 x half> %v69, half %v73, i32 14
+  %v75 = getelementptr i32, i32* %a0, i32 15
+  %v76 = load i32, i32* %v75, align 4
+  %v77 = getelementptr half, half* %a1, i32 %v76
+  %v78 = load half, half* %v77, align 4
+  %v79 = insertelement <64 x half> %v74, half %v78, i32 15
+  %v80 = getelementptr i32, i32* %a0, i32 16
+  %v81 = load i32, i32* %v80, align 4
+  %v82 = getelementptr half, half* %a1, i32 %v81
+  %v83 = load half, half* %v82, align 4
+  %v84 = insertelement <64 x half> %v79, half %v83, i32 16
+  %v85 = getelementptr i32, i32* %a0, i32 17
+  %v86 = load i32, i32* %v85, align 4
+  %v87 = getelementptr half, half* %a1, i32 %v86
+  %v88 = load half, half* %v87, align 4
+  %v89 = insertelement <64 x half> %v84, half %v88, i32 17
+  %v90 = getelementptr i32, i32* %a0, i32 18
+  %v91 = load i32, i32* %v90, align 4
+  %v92 = getelementptr half, half* %a1, i32 %v91
+  %v93 = load half, half* %v92, align 4
+  %v94 = insertelement <64 x half> %v89, half %v93, i32 18
+  %v95 = getelementptr i32, i32* %a0, i32 19
+  %v96 = load i32, i32* %v95, align 4
+  %v97 = getelementptr half, half* %a1, i32 %v96
+  %v98 = load half, half* %v97, align 4
+  %v99 = insertelement <64 x half> %v94, half %v98, i32 19
+  %v100 = getelementptr i32, i32* %a0, i32 20
+  %v101 = load i32, i32* %v100, align 4
+  %v102 = getelementptr half, half* %a1, i32 %v101
+  %v103 = load half, half* %v102, align 4
+  %v104 = insertelement <64 x half> %v99, half %v103, i32 20
+  %v105 = getelementptr i32, i32* %a0, i32 21
+  %v106 = load i32, i32* %v105, align 4
+  %v107 = getelementptr half, half* %a1, i32 %v106
+  %v108 = load half, half* %v107, align 4
+  %v109 = insertelement <64 x half> %v104, half %v108, i32 21
+  %v110 = getelementptr i32, i32* %a0, i32 22
+  %v111 = load i32, i32* %v110, align 4
+  %v112 = getelementptr half, half* %a1, i32 %v111
+  %v113 = load half, half* %v112, align 4
+  %v114 = insertelement <64 x half> %v109, half %v113, i32 22
+  %v115 = getelementptr i32, i32* %a0, i32 23
+  %v116 = load i32, i32* %v115, align 4
+  %v117 = getelementptr half, half* %a1, i32 %v116
+  %v118 = load half, half* %v117, align 4
+  %v119 = insertelement <64 x half> %v114, half %v118, i32 23
+  %v120 = getelementptr i32, i32* %a0, i32 24
+  %v121 = load i32, i32* %v120, align 4
+  %v122 = getelementptr half, half* %a1, i32 %v121
+  %v123 = load half, half* %v122, align 4
+  %v124 = insertelement <64 x half> %v119, half %v123, i32 24
+  %v125 = getelementptr i32, i32* %a0, i32 25
+  %v126 = load i32, i32* %v125, align 4
+  %v127 = getelementptr half, half* %a1, i32 %v126
+  %v128 = load half, half* %v127, align 4
+  %v129 = insertelement <64 x half> %v124, half %v128, i32 25
+  %v130 = getelementptr i32, i32* %a0, i32 26
+  %v131 = load i32, i32* %v130, align 4
+  %v132 = getelementptr half, half* %a1, i32 %v131
+  %v133 = load half, half* %v132, align 4
+  %v134 = insertelement <64 x half> %v129, half %v133, i32 26
+  %v135 = getelementptr i32, i32* %a0, i32 27
+  %v136 = load i32, i32* %v135, align 4
+  %v137 = getelementptr half, half* %a1, i32 %v136
+  %v138 = load half, half* %v137, align 4
+  %v139 = insertelement <64 x half> %v134, half %v138, i32 27
+  %v140 = getelementptr i32, i32* %a0, i32 28
+  %v141 = load i32, i32* %v140, align 4
+  %v142 = getelementptr half, half* %a1, i32 %v141
+  %v143 = load half, half* %v142, align 4
+  %v144 = insertelement <64 x half> %v139, half %v143, i32 28
+  %v145 = getelementptr i32, i32* %a0, i32 29
+  %v146 = load i32, i32* %v145, align 4
+  %v147 = getelementptr half, half* %a1, i32 %v146
+  %v148 = load half, half* %v147, align 4
+  %v149 = insertelement <64 x half> %v144, half %v148, i32 29
+  %v150 = getelementptr i32, i32* %a0, i32 30
+  %v151 = load i32, i32* %v150, align 4
+  %v152 = getelementptr half, half* %a1, i32 %v151
+  %v153 = load half, half* %v152, align 4
+  %v154 = insertelement <64 x half> %v149, half %v153, i32 30
+  %v155 = getelementptr i32, i32* %a0, i32 31
+  %v156 = load i32, i32* %v155, align 4
+  %v157 = getelementptr half, half* %a1, i32 %v156
+  %v158 = load half, half* %v157, align 4
+  %v159 = insertelement <64 x half> %v154, half %v158, i32 31
+  %v160 = getelementptr i32, i32* %a0, i32 32
+  %v161 = load i32, i32* %v160, align 4
+  %v162 = getelementptr half, half* %a1, i32 %v161
+  %v163 = load half, half* %v162, align 4
+  %v164 = insertelement <64 x half> %v159, half %v163, i32 32
+  %v165 = getelementptr i32, i32* %a0, i32 33
+  %v166 = load i32, i32* %v165, align 4
+  %v167 = getelementptr half, half* %a1, i32 %v166
+  %v168 = load half, half* %v167, align 4
+  %v169 = insertelement <64 x half> %v164, half %v168, i32 33
+  %v170 = getelementptr i32, i32* %a0, i32 34
+  %v171 = load i32, i32* %v170, align 4
+  %v172 = getelementptr half, half* %a1, i32 %v171
+  %v173 = load half, half* %v172, align 4
+  %v174 = insertelement <64 x half> %v169, half %v173, i32 34
+  %v175 = getelementptr i32, i32* %a0, i32 35
+  %v176 = load i32, i32* %v175, align 4
+  %v177 = getelementptr half, half* %a1, i32 %v176
+  %v178 = load half, half* %v177, align 4
+  %v179 = insertelement <64 x half> %v174, half %v178, i32 35
+  %v180 = getelementptr i32, i32* %a0, i32 36
+  %v181 = load i32, i32* %v180, align 4
+  %v182 = getelementptr half, half* %a1, i32 %v181
+  %v183 = load half, half* %v182, align 4
+  %v184 = insertelement <64 x half> %v179, half %v183, i32 36
+  %v185 = getelementptr i32, i32* %a0, i32 37
+  %v186 = load i32, i32* %v185, align 4
+  %v187 = getelementptr half, half* %a1, i32 %v186
+  %v188 = load half, half* %v187, align 4
+  %v189 = insertelement <64 x half> %v184, half %v188, i32 37
+  %v190 = getelementptr i32, i32* %a0, i32 38
+  %v191 = load i32, i32* %v190, align 4
+  %v192 = getelementptr half, half* %a1, i32 %v191
+  %v193 = load half, half* %v192, align 4
+  %v194 = insertelement <64 x half> %v189, half %v193, i32 38
+  %v195 = getelementptr i32, i32* %a0, i32 39
+  %v196 = load i32, i32* %v195, align 4
+  %v197 = getelementptr half, half* %a1, i32 %v196
+  %v198 = load half, half* %v197, align 4
+  %v199 = insertelement <64 x half> %v194, half %v198, i32 39
+  %v200 = getelementptr i32, i32* %a0, i32 40
+  %v201 = load i32, i32* %v200, align 4
+  %v202 = getelementptr half, half* %a1, i32 %v201
+  %v203 = load half, half* %v202, align 4
+  %v204 = insertelement <64 x half> %v199, half %v203, i32 40
+  %v205 = getelementptr i32, i32* %a0, i32 41
+  %v206 = load i32, i32* %v205, align 4
+  %v207 = getelementptr half, half* %a1, i32 %v206
+  %v208 = load half, half* %v207, align 4
+  %v209 = insertelement <64 x half> %v204, half %v208, i32 41
+  %v210 = getelementptr i32, i32* %a0, i32 42
+  %v211 = load i32, i32* %v210, align 4
+  %v212 = getelementptr half, half* %a1, i32 %v211
+  %v213 = load half, half* %v212, align 4
+  %v214 = insertelement <64 x half> %v209, half %v213, i32 42
+  %v215 = getelementptr i32, i32* %a0, i32 43
+  %v216 = load i32, i32* %v215, align 4
+  %v217 = getelementptr half, half* %a1, i32 %v216
+  %v218 = load half, half* %v217, align 4
+  %v219 = insertelement <64 x half> %v214, half %v218, i32 43
+  %v220 = getelementptr i32, i32* %a0, i32 44
+  %v221 = load i32, i32* %v220, align 4
+  %v222 = getelementptr half, half* %a1, i32 %v221
+  %v223 = load half, half* %v222, align 4
+  %v224 = insertelement <64 x half> %v219, half %v223, i32 44
+  %v225 = getelementptr i32, i32* %a0, i32 45
+  %v226 = load i32, i32* %v225, align 4
+  %v227 = getelementptr half, half* %a1, i32 %v226
+  %v228 = load half, half* %v227, align 4
+  %v229 = insertelement <64 x half> %v224, half %v228, i32 45
+  %v230 = getelementptr i32, i32* %a0, i32 46
+  %v231 = load i32, i32* %v230, align 4
+  %v232 = getelementptr half, half* %a1, i32 %v231
+  %v233 = load half, half* %v232, align 4
+  %v234 = insertelement <64 x half> %v229, half %v233, i32 46
+  %v235 = getelementptr i32, i32* %a0, i32 47
+  %v236 = load i32, i32* %v235, align 4
+  %v237 = getelementptr half, half* %a1, i32 %v236
+  %v238 = load half, half* %v237, align 4
+  %v239 = insertelement <64 x half> %v234, half %v238, i32 47
+  %v240 = getelementptr i32, i32* %a0, i32 48
+  %v241 = load i32, i32* %v240, align 4
+  %v242 = getelementptr half, half* %a1, i32 %v241
+  %v243 = load half, half* %v242, align 4
+  %v244 = insertelement <64 x half> %v239, half %v243, i32 48
+  %v245 = getelementptr i32, i32* %a0, i32 49
+  %v246 = load i32, i32* %v245, align 4
+  %v247 = getelementptr half, half* %a1, i32 %v246
+  %v248 = load half, half* %v247, align 4
+  %v249 = insertelement <64 x half> %v244, half %v248, i32 49
+  %v250 = getelementptr i32, i32* %a0, i32 50
+  %v251 = load i32, i32* %v250, align 4
+  %v252 = getelementptr half, half* %a1, i32 %v251
+  %v253 = load half, half* %v252, align 4
+  %v254 = insertelement <64 x half> %v249, half %v253, i32 50
+  %v255 = getelementptr i32, i32* %a0, i32 51
+  %v256 = load i32, i32* %v255, align 4
+  %v257 = getelementptr half, half* %a1, i32 %v256
+  %v258 = load half, half* %v257, align 4
+  %v259 = insertelement <64 x half> %v254, half %v258, i32 51
+  %v260 = getelementptr i32, i32* %a0, i32 52
+  %v261 = load i32, i32* %v260, align 4
+  %v262 = getelementptr half, half* %a1, i32 %v261
+  %v263 = load half, half* %v262, align 4
+  %v264 = insertelement <64 x half> %v259, half %v263, i32 52
+  %v265 = getelementptr i32, i32* %a0, i32 53
+  %v266 = load i32, i32* %v265, align 4
+  %v267 = getelementptr half, half* %a1, i32 %v266
+  %v268 = load half, half* %v267, align 4
+  %v269 = insertelement <64 x half> %v264, half %v268, i32 53
+  %v270 = getelementptr i32, i32* %a0, i32 54
+  %v271 = load i32, i32* %v270, align 4
+  %v272 = getelementptr half, half* %a1, i32 %v271
+  %v273 = load half, half* %v272, align 4
+  %v274 = insertelement <64 x half> %v269, half %v273, i32 54
+  %v275 = getelementptr i32, i32* %a0, i32 55
+  %v276 = load i32, i32* %v275, align 4
+  %v277 = getelementptr half, half* %a1, i32 %v276
+  %v278 = load half, half* %v277, align 4
+  %v279 = insertelement <64 x half> %v274, half %v278, i32 55
+  %v280 = getelementptr i32, i32* %a0, i32 56
+  %v281 = load i32, i32* %v280, align 4
+  %v282 = getelementptr half, half* %a1, i32 %v281
+  %v283 = load half, half* %v282, align 4
+  %v284 = insertelement <64 x half> %v279, half %v283, i32 56
+  %v285 = getelementptr i32, i32* %a0, i32 57
+  %v286 = load i32, i32* %v285, align 4
+  %v287 = getelementptr half, half* %a1, i32 %v286
+  %v288 = load half, half* %v287, align 4
+  %v289 = insertelement <64 x half> %v284, half %v288, i32 57
+  %v290 = getelementptr i32, i32* %a0, i32 58
+  %v291 = load i32, i32* %v290, align 4
+  %v292 = getelementptr half, half* %a1, i32 %v291
+  %v293 = load half, half* %v292, align 4
+  %v294 = insertelement <64 x half> %v289, half %v293, i32 58
+  %v295 = getelementptr i32, i32* %a0, i32 59
+  %v296 = load i32, i32* %v295, align 4
+  %v297 = getelementptr half, half* %a1, i32 %v296
+  %v298 = load half, half* %v297, align 4
+  %v299 = insertelement <64 x half> %v294, half %v298, i32 59
+  %v300 = getelementptr i32, i32* %a0, i32 60
+  %v301 = load i32, i32* %v300, align 4
+  %v302 = getelementptr half, half* %a1, i32 %v301
+  %v303 = load half, half* %v302, align 4
+  %v304 = insertelement <64 x half> %v299, half %v303, i32 60
+  %v305 = getelementptr i32, i32* %a0, i32 61
+  %v306 = load i32, i32* %v305, align 4
+  %v307 = getelementptr half, half* %a1, i32 %v306
+  %v308 = load half, half* %v307, align 4
+  %v309 = insertelement <64 x half> %v304, half %v308, i32 61
+  %v310 = getelementptr i32, i32* %a0, i32 62
+  %v311 = load i32, i32* %v310, align 4
+  %v312 = getelementptr half, half* %a1, i32 %v311
+  %v313 = load half, half* %v312, align 4
+  %v314 = insertelement <64 x half> %v309, half %v313, i32 62
+  %v315 = getelementptr i32, i32* %a0, i32 63
+  %v316 = load i32, i32* %v315, align 4
+  %v317 = getelementptr half, half* %a1, i32 %v316
+  %v318 = load half, half* %v317, align 4
+  %v319 = insertelement <64 x half> %v314, half %v318, i32 63
+  ret <64 x half> %v319
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv69" "target-features"="+hvxv69,+hvx-length128b,+hvx-qfloat" } ; Attribute group #0: nounwind, CPU hexagonv69, with the hvxv69, hvx-length128b and hvx-qfloat target features enabled.