[flang-commits] [flang] d652884 - [flang] Add PowerPC vec_extract, vec_insert, vec_mergeh, vec_mergel, vec_perm and vec_permi intrinsics

Kelvin Li via flang-commits flang-commits at lists.llvm.org
Thu Aug 10 13:20:48 PDT 2023


Author: Kelvin Li
Date: 2023-08-10T16:20:05-04:00
New Revision: d652884eb2d42c7d28109ef302554f8939a3f227

URL: https://github.com/llvm/llvm-project/commit/d652884eb2d42c7d28109ef302554f8939a3f227
DIFF: https://github.com/llvm/llvm-project/commit/d652884eb2d42c7d28109ef302554f8939a3f227.diff

LOG: [flang] Add PowerPC vec_extract, vec_insert, vec_mergeh, vec_mergel, vec_perm and vec_permi intrinsics

Co-authored-by: Paul Scoropan <1paulscoropan at gmail.com>
Co-authored-by: Mark Danial <Mark.Danial at ibm.com>

Differential Revision: https://reviews.llvm.org/D156197

Added: 
    flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90
    flang/test/Lower/PowerPC/ppc-vec-extract.f90
    flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90
    flang/test/Lower/PowerPC/ppc-vec-insert.f90
    flang/test/Lower/PowerPC/ppc-vec-merge-elem-order.f90
    flang/test/Lower/PowerPC/ppc-vec-merge.f90
    flang/test/Lower/PowerPC/ppc-vec-perm-elem-order.f90
    flang/test/Lower/PowerPC/ppc-vec-perm.f90

Modified: 
    flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
    flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
    flang/lib/Semantics/check-call.cpp
    flang/module/__ppc_intrinsics.f90
    flang/test/Semantics/PowerPC/ppc-vector-intrinsics.f90

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index 4df8ee9e3145cb..981dd382f62bcb 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -29,9 +29,13 @@ enum class VecOp {
   Convert,
   Ctf,
   Cvf,
-  Nmadd,
+  Mergeh,
+  Mergel,
   Msub,
   Mul,
+  Nmadd,
+  Perm,
+  Permi,
   Sel,
   Sl,
   Sld,
@@ -167,6 +171,20 @@ struct PPCIntrinsicLibrary : IntrinsicLibrary {
   fir::ExtendedValue genVecAnyCompare(mlir::Type resultType,
                                       llvm::ArrayRef<fir::ExtendedValue> args);
 
+  fir::ExtendedValue genVecExtract(mlir::Type resultType,
+                                   llvm::ArrayRef<fir::ExtendedValue> args);
+
+  fir::ExtendedValue genVecInsert(mlir::Type resultType,
+                                  llvm::ArrayRef<fir::ExtendedValue> args);
+
+  template <VecOp>
+  fir::ExtendedValue genVecMerge(mlir::Type resultType,
+                                 llvm::ArrayRef<fir::ExtendedValue> args);
+
+  template <VecOp>
+  fir::ExtendedValue genVecPerm(mlir::Type resultType,
+                                llvm::ArrayRef<fir::ExtendedValue> args);
+
   template <VecOp>
   fir::ExtendedValue genVecNmaddMsub(mlir::Type resultType,
                                      llvm::ArrayRef<fir::ExtendedValue> args);

diff  --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index 7f7cf850c433ad..ec8488d03418fa 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -121,6 +121,24 @@ static constexpr IntrinsicHandler ppcHandlers[]{
          &PI::genVecConvert<VecOp::Cvf>),
      {{{"arg1", asValue}}},
      /*isElemental=*/true},
+    {"__ppc_vec_extract",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(&PI::genVecExtract),
+     {{{"arg1", asValue}, {"arg2", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_vec_insert",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(&PI::genVecInsert),
+     {{{"arg1", asValue}, {"arg2", asValue}, {"arg3", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_vec_mergeh",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(
+         &PI::genVecMerge<VecOp::Mergeh>),
+     {{{"arg1", asValue}, {"arg2", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_vec_mergel",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(
+         &PI::genVecMerge<VecOp::Mergel>),
+     {{{"arg1", asValue}, {"arg2", asValue}}},
+     /*isElemental=*/true},
     {"__ppc_vec_msub",
      static_cast<IntrinsicLibrary::ExtendedGenerator>(
          &PI::genVecNmaddMsub<VecOp::Msub>),
@@ -136,6 +154,16 @@ static constexpr IntrinsicHandler ppcHandlers[]{
          &PI::genVecNmaddMsub<VecOp::Nmadd>),
      {{{"arg1", asValue}, {"arg2", asValue}, {"arg3", asValue}}},
      /*isElemental=*/true},
+    {"__ppc_vec_perm",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(
+         &PI::genVecPerm<VecOp::Perm>),
+     {{{"arg1", asValue}, {"arg2", asValue}, {"arg3", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_vec_permi",
+     static_cast<IntrinsicLibrary::ExtendedGenerator>(
+         &PI::genVecPerm<VecOp::Permi>),
+     {{{"arg1", asValue}, {"arg2", asValue}, {"arg3", asValue}}},
+     /*isElemental=*/true},
     {"__ppc_vec_sel",
      static_cast<IntrinsicLibrary::ExtendedGenerator>(&PI::genVecSel),
      {{{"arg1", asValue}, {"arg2", asValue}, {"arg3", asValue}}},
@@ -1066,6 +1094,150 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType,
   }
 }
 
+static mlir::Value convertVectorElementOrder(fir::FirOpBuilder &builder,
+                                             mlir::Location loc,
+                                             VecTypeInfo vecInfo,
+                                             mlir::Value idx) {
+  mlir::Value numSub1{
+      builder.createIntegerConstant(loc, idx.getType(), vecInfo.len - 1)};
+  return builder.create<mlir::LLVM::SubOp>(loc, idx.getType(), numSub1, idx);
+}
+
+// VEC_EXTRACT
+fir::ExtendedValue
+PPCIntrinsicLibrary::genVecExtract(mlir::Type resultType,
+                                   llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  auto argBases{getBasesForArgs(args)};
+  auto argTypes{getTypesForArgs(argBases)};
+  auto vecTyInfo{getVecTypeFromFir(argBases[0])};
+
+  auto mlirTy{vecTyInfo.toMlirVectorType(builder.getContext())};
+  auto varg0{builder.createConvert(loc, mlirTy, argBases[0])};
+
+  // arg2 modulo the number of elements in arg1 to determine the element
+  // position
+  auto numEle{builder.createIntegerConstant(loc, argTypes[1], vecTyInfo.len)};
+  mlir::Value uremOp{
+      builder.create<mlir::LLVM::URemOp>(loc, argBases[1], numEle)};
+
+  if (!isNativeVecElemOrderOnLE())
+    uremOp = convertVectorElementOrder(builder, loc, vecTyInfo, uremOp);
+
+  return builder.create<mlir::vector::ExtractElementOp>(loc, varg0, uremOp);
+}
+
+// VEC_INSERT
+fir::ExtendedValue
+PPCIntrinsicLibrary::genVecInsert(mlir::Type resultType,
+                                  llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  auto argBases{getBasesForArgs(args)};
+  auto argTypes{getTypesForArgs(argBases)};
+  auto vecTyInfo{getVecTypeFromFir(argBases[1])};
+  auto mlirTy{vecTyInfo.toMlirVectorType(builder.getContext())};
+  auto varg1{builder.createConvert(loc, mlirTy, argBases[1])};
+
+  auto numEle{builder.createIntegerConstant(loc, argTypes[2], vecTyInfo.len)};
+  mlir::Value uremOp{
+      builder.create<mlir::LLVM::URemOp>(loc, argBases[2], numEle)};
+
+  if (!isNativeVecElemOrderOnLE())
+    uremOp = convertVectorElementOrder(builder, loc, vecTyInfo, uremOp);
+
+  auto res{builder.create<mlir::vector::InsertElementOp>(loc, argBases[0],
+                                                         varg1, uremOp)};
+  return builder.create<fir::ConvertOp>(loc, vecTyInfo.toFirVectorType(), res);
+}
+
+// VEC_MERGEH, VEC_MERGEL
+template <VecOp vop>
+fir::ExtendedValue
+PPCIntrinsicLibrary::genVecMerge(mlir::Type resultType,
+                                 llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  auto argBases{getBasesForArgs(args)};
+  auto vecTyInfo{getVecTypeFromFir(argBases[0])};
+  llvm::SmallVector<int64_t, 16> mMask; // native vector element order mask
+  llvm::SmallVector<int64_t, 16> rMask; // non-native vector element order mask
+
+  switch (vop) {
+  case VecOp::Mergeh: {
+    switch (vecTyInfo.len) {
+    case 2: {
+      enum { V1 = 0, V2 = 2 };
+      mMask = {V1 + 0, V2 + 0};
+      rMask = {V2 + 1, V1 + 1};
+      break;
+    }
+    case 4: {
+      enum { V1 = 0, V2 = 4 };
+      mMask = {V1 + 0, V2 + 0, V1 + 1, V2 + 1};
+      rMask = {V2 + 2, V1 + 2, V2 + 3, V1 + 3};
+      break;
+    }
+    case 8: {
+      enum { V1 = 0, V2 = 8 };
+      mMask = {V1 + 0, V2 + 0, V1 + 1, V2 + 1, V1 + 2, V2 + 2, V1 + 3, V2 + 3};
+      rMask = {V2 + 4, V1 + 4, V2 + 5, V1 + 5, V2 + 6, V1 + 6, V2 + 7, V1 + 7};
+      break;
+    }
+    case 16:
+      mMask = {0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
+               0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17};
+      rMask = {0x18, 0x08, 0x19, 0x09, 0x1A, 0x0A, 0x1B, 0x0B,
+               0x1C, 0x0C, 0x1D, 0x0D, 0x1E, 0x0E, 0x1F, 0x0F};
+      break;
+    default:
+      llvm_unreachable("unexpected vector length");
+    }
+    break;
+  }
+  case VecOp::Mergel: {
+    switch (vecTyInfo.len) {
+    case 2: {
+      enum { V1 = 0, V2 = 2 };
+      mMask = {V1 + 1, V2 + 1};
+      rMask = {V2 + 0, V1 + 0};
+      break;
+    }
+    case 4: {
+      enum { V1 = 0, V2 = 4 };
+      mMask = {V1 + 2, V2 + 2, V1 + 3, V2 + 3};
+      rMask = {V2 + 0, V1 + 0, V2 + 1, V1 + 1};
+      break;
+    }
+    case 8: {
+      enum { V1 = 0, V2 = 8 };
+      mMask = {V1 + 4, V2 + 4, V1 + 5, V2 + 5, V1 + 6, V2 + 6, V1 + 7, V2 + 7};
+      rMask = {V2 + 0, V1 + 0, V2 + 1, V1 + 1, V2 + 2, V1 + 2, V2 + 3, V1 + 3};
+      break;
+    }
+    case 16:
+      mMask = {0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B,
+               0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F};
+      rMask = {0x10, 0x00, 0x11, 0x01, 0x12, 0x02, 0x13, 0x03,
+               0x14, 0x04, 0x15, 0x05, 0x16, 0x06, 0x17, 0x07};
+      break;
+    default:
+      llvm_unreachable("unexpected vector length");
+    }
+    break;
+  }
+  default:
+    llvm_unreachable("invalid vector operation for generator");
+  }
+
+  auto vargs{convertVecArgs(builder, loc, vecTyInfo, argBases)};
+
+  llvm::SmallVector<int64_t, 16> &mergeMask =
+      (isBEVecElemOrderOnLE()) ? rMask : mMask;
+
+  auto callOp{builder.create<mlir::vector::ShuffleOp>(loc, vargs[0], vargs[1],
+                                                      mergeMask)};
+  return builder.createConvert(loc, resultType, callOp);
+}
+
 // VEC_NMADD, VEC_MSUB
 template <VecOp vop>
 fir::ExtendedValue
@@ -1112,6 +1284,123 @@ PPCIntrinsicLibrary::genVecNmaddMsub(mlir::Type resultType,
   llvm_unreachable("Invalid vector operation for generator");
 }
 
+// VEC_PERM, VEC_PERMI
+template <VecOp vop>
+fir::ExtendedValue
+PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType,
+                                llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 3);
+  auto context{builder.getContext()};
+  auto argBases{getBasesForArgs(args)};
+  auto argTypes{getTypesForArgs(argBases)};
+  auto vecTyInfo{getVecTypeFromFir(argBases[0])};
+  auto mlirTy{vecTyInfo.toMlirVectorType(context)};
+
+  auto vi32Ty{mlir::VectorType::get(4, mlir::IntegerType::get(context, 32))};
+  auto vf64Ty{mlir::VectorType::get(2, mlir::FloatType::getF64(context))};
+
+  auto mArg0{builder.createConvert(loc, mlirTy, argBases[0])};
+  auto mArg1{builder.createConvert(loc, mlirTy, argBases[1])};
+
+  switch (vop) {
+  case VecOp::Perm: {
+    VecTypeInfo maskVecTyInfo{getVecTypeFromFir(argBases[2])};
+    auto mlirMaskTy{maskVecTyInfo.toMlirVectorType(context)};
+    auto mMask{builder.createConvert(loc, mlirMaskTy, argBases[2])};
+
+    if (mlirTy != vi32Ty) {
+      mArg0 =
+          builder.create<mlir::LLVM::BitcastOp>(loc, vi32Ty, mArg0).getResult();
+      mArg1 =
+          builder.create<mlir::LLVM::BitcastOp>(loc, vi32Ty, mArg1).getResult();
+    }
+
+    auto funcOp{builder.addNamedFunction(
+        loc, "llvm.ppc.altivec.vperm",
+        genFuncType<Ty::IntegerVector<4>, Ty::IntegerVector<4>,
+                    Ty::IntegerVector<4>, Ty::IntegerVector<1>>(context,
+                                                                builder))};
+
+    llvm::SmallVector<mlir::Value> newArgs;
+    if (isNativeVecElemOrderOnLE()) {
+      auto i8Ty{mlir::IntegerType::get(context, 8)};
+      auto v8Ty{mlir::VectorType::get(16, i8Ty)};
+      auto negOne{builder.createIntegerConstant(loc, i8Ty, -1)};
+      auto vNegOne{
+          builder.create<mlir::vector::BroadcastOp>(loc, v8Ty, negOne)};
+
+      mMask = builder.create<mlir::arith::XOrIOp>(loc, mMask, vNegOne);
+      newArgs = {mArg1, mArg0, mMask};
+    } else {
+      newArgs = {mArg0, mArg1, mMask};
+    }
+
+    auto res{builder.create<fir::CallOp>(loc, funcOp, newArgs).getResult(0)};
+
+    if (res.getType() != argTypes[0]) {
+      // fir.call llvm.ppc.altivec.vperm returns !fir.vector<i4:32>
+      // convert the result back to the original type
+      res = builder.createConvert(loc, vi32Ty, res);
+      if (mlirTy != vi32Ty)
+        res =
+            builder.create<mlir::LLVM::BitcastOp>(loc, mlirTy, res).getResult();
+    }
+    return builder.createConvert(loc, resultType, res);
+  }
+  case VecOp::Permi: {
+    // arg3 is a constant
+    auto constIntOp{
+        mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[2].getDefiningOp())
+            .getValue()
+            .dyn_cast_or_null<mlir::IntegerAttr>()};
+    assert(constIntOp && "expected integer constant argument");
+    auto constInt{constIntOp.getInt()};
+    // arg1, arg2, and result type share same VecTypeInfo
+    if (vecTyInfo.isFloat()) {
+      mArg0 =
+          builder.create<mlir::LLVM::BitcastOp>(loc, vf64Ty, mArg0).getResult();
+      mArg1 =
+          builder.create<mlir::LLVM::BitcastOp>(loc, vf64Ty, mArg1).getResult();
+    }
+
+    llvm::SmallVector<int64_t, 2> nMask; // native vector element order mask
+    llvm::SmallVector<int64_t, 2> rMask; // non-native vector element order mask
+    enum { V1 = 0, V2 = 2 };
+    switch (constInt) {
+    case 0:
+      nMask = {V1 + 0, V2 + 0};
+      rMask = {V2 + 1, V1 + 1};
+      break;
+    case 1:
+      nMask = {V1 + 0, V2 + 1};
+      rMask = {V2 + 0, V1 + 1};
+      break;
+    case 2:
+      nMask = {V1 + 1, V2 + 0};
+      rMask = {V2 + 1, V1 + 0};
+      break;
+    case 3:
+      nMask = {V1 + 1, V2 + 1};
+      rMask = {V2 + 0, V1 + 0};
+      break;
+    default:
+      llvm_unreachable("unexpected arg3 value for vec_permi");
+    }
+
+    llvm::SmallVector<int64_t, 2> mask =
+        (isBEVecElemOrderOnLE()) ? rMask : nMask;
+    auto res{builder.create<mlir::vector::ShuffleOp>(loc, mArg0, mArg1, mask)};
+    if (res.getType() != mlirTy) {
+      auto cast{builder.create<mlir::LLVM::BitcastOp>(loc, mlirTy, res)};
+      return builder.createConvert(loc, resultType, cast);
+    }
+    return builder.createConvert(loc, resultType, res);
+  }
+  default:
+    llvm_unreachable("invalid vector operation for generator");
+  }
+}
+
 // VEC_SEL
 fir::ExtendedValue
 PPCIntrinsicLibrary::genVecSel(mlir::Type resultType,

diff  --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 302694c2a3ca62..b8085f837add58 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -1536,6 +1536,9 @@ bool CheckPPCIntrinsic(const Symbol &generic, const Symbol &specific,
   if (specific.name().ToString().compare(0, 14, "__ppc_vec_ctf_") == 0) {
     return CheckArgumentIsConstantExprInRange(actuals, 1, 0, 31, messages);
   }
+  if (specific.name().ToString().compare(0, 16, "__ppc_vec_permi_") == 0) {
+    return CheckArgumentIsConstantExprInRange(actuals, 2, 0, 3, messages);
+  }
   return false;
 }
 

diff  --git a/flang/module/__ppc_intrinsics.f90 b/flang/module/__ppc_intrinsics.f90
index 80a5d87b37dc8a..4f9c8c9432a622 100644
--- a/flang/module/__ppc_intrinsics.f90
+++ b/flang/module/__ppc_intrinsics.f90
@@ -126,6 +126,14 @@ elemental vector(real(VKIND)) function elem_func_vr##VKIND##vi##VKIND##i(arg1, a
     !dir$ ignore_tkr(k) arg2; \
   end function ;
 
+! integer function f(vector(i), i)
+#define ELEM_FUNC_IVII(VKIND) \
+  elemental integer(VKIND) function elem_func_i##VKIND##vi##VKIND##i(arg1, arg2); \
+    vector(integer(VKIND)), intent(in) :: arg1; \
+    integer(8), intent(in) :: arg2; \
+    !dir$ ignore_tkr(k) arg2; \
+  end function ;
+
 ! vector(r) function f(vector(u), i)
 #define ELEM_FUNC_VRVUI(VKIND) \
   elemental vector(real(VKIND)) function elem_func_vr##VKIND##vu##VKIND##i(arg1, arg2); \
@@ -134,6 +142,14 @@ elemental vector(real(VKIND)) function elem_func_vr##VKIND##vu##VKIND##i(arg1, a
     !dir$ ignore_tkr(k) arg2; \
   end function ;
 
+! real function f(vector(r), i)
+#define ELEM_FUNC_RVRI(VKIND) \
+  elemental real(VKIND) function elem_func_r##VKIND##vr##VKIND##i(arg1, arg2); \
+    vector(real(VKIND)), intent(in) :: arg1; \
+    integer(8), intent(in) :: arg2; \
+    !dir$ ignore_tkr(k) arg2; \
+  end function ;
+
 ! The following macros are specific for the vec_convert(v, mold) intrinsics as
! the argument keywords are different from the other vector intrinsics.
 !
@@ -167,7 +183,8 @@ pure vector(real(VKIND)) function func_vec_convert_vr##VKIND##vi##vr##VKIND(v, m
   FUNC_VEC_CONVERT_VIVIVI(1) FUNC_VEC_CONVERT_VIVIVI(2) FUNC_VEC_CONVERT_VIVIVI(4) FUNC_VEC_CONVERT_VIVIVI(8)
   FUNC_VEC_CONVERT_VUVIVU(1) FUNC_VEC_CONVERT_VUVIVU(2) FUNC_VEC_CONVERT_VUVIVU(4) FUNC_VEC_CONVERT_VUVIVU(8)
   FUNC_VEC_CONVERT_VRVIVR(4) FUNC_VEC_CONVERT_VRVIVR(8)
-
+  ELEM_FUNC_IVII(1) ELEM_FUNC_IVII(2) ELEM_FUNC_IVII(4) ELEM_FUNC_IVII(8)
+  ELEM_FUNC_RVRI(4) ELEM_FUNC_RVRI(8)
   ELEM_FUNC_VIVIVI(1) ELEM_FUNC_VIVIVI(2) ELEM_FUNC_VIVIVI(4) ELEM_FUNC_VIVIVI(8)
   ELEM_FUNC_VUVIVI(1) ELEM_FUNC_VUVIVI(2) ELEM_FUNC_VUVIVI(4) ELEM_FUNC_VUVIVI(8)
   ELEM_FUNC_VUVUVU(1) ELEM_FUNC_VUVUVU(2) ELEM_FUNC_VUVUVU(4) ELEM_FUNC_VUVUVU(8)
@@ -190,18 +207,20 @@ pure vector(real(VKIND)) function func_vec_convert_vr##VKIND##vi##vr##VKIND(v, m
 #undef FUNC_VEC_CONVERT_VRVIVR
 #undef FUNC_VEC_CONVERT_VUVIVU
 #undef FUNC_VEC_CONVERT_VIVIVI
+#undef ELEM_FUNC_RVRI
 #undef ELEM_FUNC_VRVUI
+#undef ELEM_FUNC_IVII
 #undef ELEM_FUNC_VRVII
-#undef ELEM_FUNC_IVIVI
-#undef ELEM_FUNC_IVUVU
-#undef ELEM_FUNC_VIVIVU_2
-#undef ELEM_FUNC_VUVUVU_2
-#undef ELEM_FUNC_VRVRVU_2
 #undef ELEM_FUNC_IVRVR
+#undef ELEM_FUNC_IVUVU
+#undef ELEM_FUNC_IVIVI
 #undef ELEM_FUNC_VUVRVR
+#undef ELEM_FUNC_VRVRVU_2
 #undef ELEM_FUNC_VRVRVR
-#undef ELEM_FUNC_VIVIVU
 #undef ELEM_FUNC_VUVUVU
+#undef ELEM_FUNC_VUVUVU_2
+#undef ELEM_FUNC_VIVIVU
+#undef ELEM_FUNC_VIVIVU_2
 #undef ELEM_FUNC_VUVIVI
 #undef ELEM_FUNC_VIVIVI
 
@@ -213,25 +232,28 @@ elemental vector(real(VKIND)) function elem_func_vr##VKIND##vr##VKIND##vr##VKIND
   end function ;
 
 ! vector(i) function f(vector(i), vector(i), vector(u))
-#define ELEM_FUNC_VIVIVIVU(VKIND) \
-  elemental vector(integer(VKIND)) function elem_func_vi##VKIND##vi##VKIND##vi##VKIND##vu##VKIND(arg1, arg2, arg3); \
+#define ELEM_FUNC_VIVIVIVU_2(VKIND, UKIND) \
+  elemental vector(integer(VKIND)) function elem_func_vi##VKIND##vi##VKIND##vi##VKIND##vu##UKIND(arg1, arg2, arg3); \
     vector(integer(VKIND)), intent(in) :: arg1, arg2; \
-    vector(unsigned(VKIND)), intent(in) :: arg3; \
+    vector(unsigned(UKIND)), intent(in) :: arg3; \
   end function ;
+#define ELEM_FUNC_VIVIVIVU(VKIND) ELEM_FUNC_VIVIVIVU_2(VKIND, VKIND)
 
 ! vector(u) function f(vector(u), vector(u), vector(u))
-#define ELEM_FUNC_VUVUVUVU(VKIND) \
-  elemental vector(unsigned(VKIND)) function elem_func_vu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND(arg1, arg2, arg3); \
-    vector(unsigned(VKIND)), intent(in) :: arg1, arg2, arg3; \
+#define ELEM_FUNC_VUVUVUVU_2(VKIND, UKIND) \
+  elemental vector(unsigned(VKIND)) function elem_func_vu##VKIND##vu##VKIND##vu##VKIND##vu##UKIND(arg1, arg2, arg3); \
+    vector(unsigned(VKIND)), intent(in) :: arg1, arg2; \
+    vector(unsigned(UKIND)), intent(in) :: arg3; \
   end function ;
-
+#define ELEM_FUNC_VUVUVUVU(VKIND) ELEM_FUNC_VUVUVUVU_2(VKIND, VKIND)
+  
 ! vector(r) function f(vector(r), vector(r), vector(u))
-#define ELEM_FUNC_VRVRVRVU(VKIND) \
-  elemental vector(real(VKIND)) function elem_func_vr##VKIND##vr##VKIND##vr##VKIND##vu##VKIND(arg1, arg2, arg3); \
+#define ELEM_FUNC_VRVRVRVU_2(VKIND, UKIND) \
+  elemental vector(real(VKIND)) function elem_func_vr##VKIND##vr##VKIND##vr##VKIND##vu##UKIND(arg1, arg2, arg3); \
     vector(real(VKIND)), intent(in) :: arg1, arg2; \
-    vector(unsigned(VKIND)), intent(in) :: arg3; \
+    vector(unsigned(UKIND)), intent(in) :: arg3; \
   end function ;
-
+#define ELEM_FUNC_VRVRVRVU(VKIND) ELEM_FUNC_VRVRVRVU_2(VKIND, VKIND)
 
 ! vector(i) function f(vector(i), vector(i), i)
 #define ELEM_FUNC_VIVIVII(VKIND) \
@@ -257,22 +279,49 @@ elemental vector(real(VKIND)) function elem_func_vr##VKIND##vr##VKIND##vr##VKIND
     !dir$ ignore_tkr(k) arg3; \
   end function ;
 
+! vector(i) function f(i, vector(i), i)
+#define ELEM_FUNC_VIIVII(VKIND) \
+  elemental vector(integer(VKIND)) function elem_func_vi##VKIND##i##VKIND##vi##VKIND##i(arg1, arg2, arg3); \
+    integer(VKIND), intent(in) :: arg1; \
+    vector(integer(VKIND)), intent(in) :: arg2; \
+    integer(8), intent(in) :: arg3; \
+    !dir$ ignore_tkr(k) arg3; \
+  end function ;
+
+! vector(r) function f(r, vector(r), i)
+#define ELEM_FUNC_VRRVRI(VKIND) \
+  elemental vector(real(VKIND)) function elem_func_vr##VKIND##r##VKIND##vr##VKIND##i(arg1, arg2, arg3); \
+    real(VKIND), intent(in) :: arg1; \
+    vector(real(VKIND)), intent(in) :: arg2; \
+    integer(8), intent(in) :: arg3; \
+    !dir$ ignore_tkr(k) arg3; \
+  end function ;
+
   ELEM_FUNC_VIVIVIVU(1) ELEM_FUNC_VIVIVIVU(2) ELEM_FUNC_VIVIVIVU(4) ELEM_FUNC_VIVIVIVU(8)
   ELEM_FUNC_VUVUVUVU(1) ELEM_FUNC_VUVUVUVU(2) ELEM_FUNC_VUVUVUVU(4) ELEM_FUNC_VUVUVUVU(8)
   ELEM_FUNC_VRVRVRVU(4) ELEM_FUNC_VRVRVRVU(8)
+  ELEM_FUNC_VIVIVIVU_2(2,1) ELEM_FUNC_VIVIVIVU_2(4,1) ELEM_FUNC_VIVIVIVU_2(8,1)
+  ELEM_FUNC_VUVUVUVU_2(2,1) ELEM_FUNC_VUVUVUVU_2(4,1) ELEM_FUNC_VUVUVUVU_2(8,1)
+  ELEM_FUNC_VRVRVRVU_2(4,1) ELEM_FUNC_VRVRVRVU_2(8,1)
+  ELEM_FUNC_VIIVII(1) ELEM_FUNC_VIIVII(2) ELEM_FUNC_VIIVII(4) ELEM_FUNC_VIIVII(8)
+  ELEM_FUNC_VRRVRI(4) ELEM_FUNC_VRRVRI(8)
   ELEM_FUNC_VRVRVRVR(4) ELEM_FUNC_VRVRVRVR(8)
   ELEM_FUNC_VIVIVII(1) ELEM_FUNC_VIVIVII(2) ELEM_FUNC_VIVIVII(4) ELEM_FUNC_VIVIVII(8)
   ELEM_FUNC_VUVUVUI(1) ELEM_FUNC_VUVUVUI(2) ELEM_FUNC_VUVUVUI(4) ELEM_FUNC_VUVUVUI(8)
   ELEM_FUNC_VRVRVRI(4) ELEM_FUNC_VRVRVRI(8)
 
-#undef ELEM_FUNC_VIVIVII
-#undef ELEM_FUNC_VUVUVUI
+#undef ELEM_FUNC_VRRVRI
+#undef ELEM_FUNC_VIIVII
 #undef ELEM_FUNC_VRVRVRI
-#undef ELEM_FUNC_VRVRVRVR
+#undef ELEM_FUNC_VUVUVUI
+#undef ELEM_FUNC_VIVIVII
 #undef ELEM_FUNC_VRVRVRVU
-#undef ELEM_FUNC_VRVRVRVR
+#undef ELEM_FUNC_VRVRVRVU_2
 #undef ELEM_FUNC_VUVUVUVU
+#undef ELEM_FUNC_VUVUVUVU_2
 #undef ELEM_FUNC_VIVIVIVU
+#undef ELEM_FUNC_VIVIVIVU_2
+#undef ELEM_FUNC_VRVRVRVR
 
   end interface
 
@@ -623,6 +672,28 @@ end function func_r8r8i
   end interface vec_max
   public :: vec_max
 
+! vec_mergeh
+  VEC_VI_VI_VI(vec_mergeh,1) VEC_VI_VI_VI(vec_mergeh,2) VEC_VI_VI_VI(vec_mergeh,4) VEC_VI_VI_VI(vec_mergeh,8)
+  VEC_VU_VU_VU(vec_mergeh,1) VEC_VU_VU_VU(vec_mergeh,2) VEC_VU_VU_VU(vec_mergeh,4) VEC_VU_VU_VU(vec_mergeh,8)
+  VEC_VR_VR_VR(vec_mergeh,4) VEC_VR_VR_VR(vec_mergeh,8)
+  interface vec_mergeh
+    procedure :: VI_VI_VI(vec_mergeh,1), VI_VI_VI(vec_mergeh,2), VI_VI_VI(vec_mergeh,4), VI_VI_VI(vec_mergeh,8)
+    procedure :: VU_VU_VU(vec_mergeh,1), VU_VU_VU(vec_mergeh,2), VU_VU_VU(vec_mergeh,4), VU_VU_VU(vec_mergeh,8)
+    procedure :: VR_VR_VR(vec_mergeh,4), VR_VR_VR(vec_mergeh,8)
+  end interface vec_mergeh
+  public :: vec_mergeh
+
+! vec_mergel
+  VEC_VI_VI_VI(vec_mergel,1) VEC_VI_VI_VI(vec_mergel,2) VEC_VI_VI_VI(vec_mergel,4) VEC_VI_VI_VI(vec_mergel,8)
+  VEC_VU_VU_VU(vec_mergel,1) VEC_VU_VU_VU(vec_mergel,2) VEC_VU_VU_VU(vec_mergel,4) VEC_VU_VU_VU(vec_mergel,8)
+  VEC_VR_VR_VR(vec_mergel,4) VEC_VR_VR_VR(vec_mergel,8)
+  interface vec_mergel
+    procedure :: VI_VI_VI(vec_mergel,1), VI_VI_VI(vec_mergel,2), VI_VI_VI(vec_mergel,4), VI_VI_VI(vec_mergel,8)
+    procedure :: VU_VU_VU(vec_mergel,1), VU_VU_VU(vec_mergel,2), VU_VU_VU(vec_mergel,4), VU_VU_VU(vec_mergel,8)
+    procedure :: VR_VR_VR(vec_mergel,4), VR_VR_VR(vec_mergel,8)
+  end interface vec_mergel
+  public :: vec_mergel
+
 ! vec_min
   VEC_VI_VI_VI(vec_min,1) VEC_VI_VI_VI(vec_min,2) VEC_VI_VI_VI(vec_min,4) VEC_VI_VI_VI(vec_min,8)
   VEC_VU_VU_VU(vec_min,1) VEC_VU_VU_VU(vec_min,2) VEC_VU_VU_VU(vec_min,4) VEC_VU_VU_VU(vec_min,8)
@@ -771,18 +842,34 @@ end function func_r8r8i
 ! vector function(vector, vector, vector)
 !-----------------------------------------
 #define VR_VR_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND
-#define VI_VI_VI_VU(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND##vi##VKIND##vu##VKIND
-#define VU_VU_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND
-#define VR_VR_VR_VU(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##vu##VKIND
+#define VI_VI_VI_VU_2(NAME, VKIND, UKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND##vi##VKIND##vu##UKIND
+#define VI_VI_VI_VU(NAME, VKIND) VI_VI_VI_VU_2(NAME, VKIND, VKIND)
+#define VU_VU_VU_VU_2(NAME, VKIND, UKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND##vu##VKIND##vu##UKIND
+#define VU_VU_VU_VU(NAME, VKIND) VU_VU_VU_VU_2(NAME, VKIND, VKIND)
+#define VR_VR_VR_VU_2(NAME, VKIND, UKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##vu##UKIND
+#define VR_VR_VR_VU(NAME, VKIND) VR_VR_VR_VU_2(NAME, VKIND, VKIND)
+! i0 indicates "!dir$ ignore_tkr(k) arg3"
+#define VI_VI_VI_I(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND##vi##VKIND##i0
+#define VU_VU_VU_I(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND##vu##VKIND##i0
+#define VR_VR_VR_I(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##i0
 
 #define VEC_VR_VR_VR_VR(NAME, VKIND) \
   procedure(elem_func_vr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND) :: VR_VR_VR_VR(NAME, VKIND);
-#define VEC_VI_VI_VI_VU(NAME, VKIND) \
-  procedure(elem_func_vi##VKIND##vi##VKIND##vi##VKIND##vu##VKIND) :: VI_VI_VI_VU(NAME, VKIND);
-#define VEC_VU_VU_VU_VU(NAME, VKIND) \
-  procedure(elem_func_vu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND) :: VU_VU_VU_VU(NAME, VKIND);
-#define VEC_VR_VR_VR_VU(NAME, VKIND) \
-  procedure(elem_func_vr##VKIND##vr##VKIND##vr##VKIND##vu##VKIND) :: VR_VR_VR_VU(NAME, VKIND);
+#define VEC_VI_VI_VI_VU_2(NAME, VKIND, UKIND) \
+  procedure(elem_func_vi##VKIND##vi##VKIND##vi##VKIND##vu##UKIND) :: VI_VI_VI_VU_2(NAME, VKIND, UKIND);
+#define VEC_VI_VI_VI_VU(NAME, VKIND) VEC_VI_VI_VI_VU_2(NAME, VKIND, VKIND)
+#define VEC_VU_VU_VU_VU_2(NAME, VKIND, UKIND) \
+  procedure(elem_func_vu##VKIND##vu##VKIND##vu##VKIND##vu##UKIND) :: VU_VU_VU_VU_2(NAME, VKIND, UKIND);
+#define VEC_VU_VU_VU_VU(NAME, VKIND) VEC_VU_VU_VU_VU_2(NAME, VKIND, VKIND)
+#define VEC_VR_VR_VR_VU_2(NAME, VKIND, UKIND) \
+  procedure(elem_func_vr##VKIND##vr##VKIND##vr##VKIND##vu##UKIND) :: VR_VR_VR_VU_2(NAME, VKIND, UKIND);
+#define VEC_VR_VR_VR_VU(NAME, VKIND) VEC_VR_VR_VR_VU_2(NAME, VKIND, VKIND)
+#define VEC_VI_VI_VI_I(NAME, VKIND) \
+  procedure(elem_func_vi##VKIND##vi##VKIND##vi##VKIND##i) :: VI_VI_VI_I(NAME, VKIND);
+#define VEC_VU_VU_VU_I(NAME, VKIND) \
+  procedure(elem_func_vu##VKIND##vu##VKIND##vu##VKIND##i) :: VU_VU_VU_I(NAME, VKIND);
+#define VEC_VR_VR_VR_I(NAME, VKIND) \
+  procedure(elem_func_vr##VKIND##vr##VKIND##vr##VKIND##i) :: VR_VR_VR_I(NAME, VKIND);
 
 ! vec_madd
   VEC_VR_VR_VR_VR(vec_madd,4) VEC_VR_VR_VR_VR(vec_madd,8)
@@ -812,6 +899,28 @@ end function func_r8r8i
   end interface vec_nmsub
   public :: vec_nmsub
 
+! vec_perm
+  VEC_VI_VI_VI_VU_2(vec_perm,1,1) VEC_VI_VI_VI_VU_2(vec_perm,2,1) VEC_VI_VI_VI_VU_2(vec_perm,4,1) VEC_VI_VI_VI_VU_2(vec_perm,8,1)
+  VEC_VU_VU_VU_VU_2(vec_perm,1,1) VEC_VU_VU_VU_VU_2(vec_perm,2,1) VEC_VU_VU_VU_VU_2(vec_perm,4,1) VEC_VU_VU_VU_VU_2(vec_perm,8,1)
+  VEC_VR_VR_VR_VU_2(vec_perm,4,1) VEC_VR_VR_VR_VU_2(vec_perm,8,1)
+  interface vec_perm
+     procedure :: VI_VI_VI_VU_2(vec_perm,1,1), VI_VI_VI_VU_2(vec_perm,2,1), VI_VI_VI_VU_2(vec_perm,4,1), VI_VI_VI_VU_2(vec_perm,8,1)
+     procedure :: VU_VU_VU_VU_2(vec_perm,1,1), VU_VU_VU_VU_2(vec_perm,2,1), VU_VU_VU_VU_2(vec_perm,4,1), VU_VU_VU_VU_2(vec_perm,8,1)
+     procedure :: VR_VR_VR_VU_2(vec_perm,4,1), VR_VR_VR_VU_2(vec_perm,8,1)
+  end interface vec_perm
+  public :: vec_perm
+
+! vec_permi
+  VEC_VI_VI_VI_I(vec_permi,8)
+  VEC_VU_VU_VU_I(vec_permi,8)
+  VEC_VR_VR_VR_I(vec_permi,4) VEC_VR_VR_VR_I(vec_permi,8)
+  interface vec_permi
+     procedure :: VI_VI_VI_I(vec_permi,8)
+     procedure :: VU_VU_VU_I(vec_permi,8)
+     procedure :: VR_VR_VR_I(vec_permi,4), VR_VR_VR_I(vec_permi,8)
+  end interface vec_permi
+  public :: vec_permi
+
 ! vec_sel
   VEC_VI_VI_VI_VU(vec_sel,1) VEC_VI_VI_VI_VU(vec_sel,2) VEC_VI_VI_VI_VU(vec_sel,4) VEC_VI_VI_VI_VU(vec_sel,8)
   VEC_VU_VU_VU_VU(vec_sel,1) VEC_VU_VU_VU_VU(vec_sel,2) VEC_VU_VU_VU_VU(vec_sel,4) VEC_VU_VU_VU_VU(vec_sel,8)
@@ -823,15 +932,53 @@ end function func_r8r8i
   end interface vec_sel
   public :: vec_sel
 
+#undef VEC_VR_VR_VR_I
+#undef VEC_VU_VU_VU_I
+#undef VEC_VI_VI_VI_I
+#undef VEC_VI_VI_VI_VU_2
 #undef VEC_VI_VI_VI_VU
+#undef VEC_VU_VU_VU_VU_2
 #undef VEC_VU_VU_VU_VU
+#undef VEC_VR_VR_VR_VU_2
 #undef VEC_VR_VR_VR_VU
 #undef VEC_VR_VR_VR_VR
+#undef VR_VR_VR_I
+#undef VU_VU_VU_I
+#undef VI_VI_VI_I
 #undef VI_VI_VI_VU
+#undef VI_VI_VI_VU_2
 #undef VU_VU_VU_VU
+#undef VU_VU_VU_VU_2
 #undef VR_VR_VR_VU
+#undef VR_VR_VR_VU_2
 #undef VR_VR_VR_VR
 
+!------------------------------------------
+! vector function(integer, vector, integer)
+! vector function(real, vector, integer)
+!------------------------------------------
+#define VI_I_VI_I(NAME, VKIND) __ppc_##NAME##_vi##VKIND##i##VKIND##vi##VKIND##i0
+#define VR_R_VR_I(NAME, VKIND) __ppc_##NAME##_vr##VKIND##r##VKIND##vr##VKIND##i0
+
+#define VEC_VI_I_VI_I(NAME, VKIND) \
+  procedure(elem_func_vi##VKIND##i##VKIND##vi##VKIND##i) :: VI_I_VI_I(NAME, VKIND);
+#define VEC_VR_R_VR_I(NAME, VKIND) \
+  procedure(elem_func_vr##VKIND##r##VKIND##vr##VKIND##i) :: VR_R_VR_I(NAME, VKIND);
+
+! vec_insert
+  VEC_VI_I_VI_I(vec_insert,1) VEC_VI_I_VI_I(vec_insert,2) VEC_VI_I_VI_I(vec_insert,4) VEC_VI_I_VI_I(vec_insert,8)
+  VEC_VR_R_VR_I(vec_insert,4) VEC_VR_R_VR_I(vec_insert,8)
+  interface vec_insert
+     procedure :: VI_I_VI_I(vec_insert,1), VI_I_VI_I(vec_insert,2), VI_I_VI_I(vec_insert,4), VI_I_VI_I(vec_insert,8)
+     procedure :: VR_R_VR_I(vec_insert,4), VR_R_VR_I(vec_insert,8)
+  end interface vec_insert
+  public :: vec_insert
+
+#undef VEC_VR_R_VR_I
+#undef VEC_VI_I_VI_I
+#undef VR_R_VR_I
+#undef VI_I_VI_I
+
 !----------------------------------
 ! integer function(vector, vector)
 !----------------------------------
@@ -864,6 +1011,26 @@ end function func_r8r8i
 #undef I_VU_VU
 #undef I_VI_VI
 
+!----------------------------------------
+! integer/real function(vector, integer)
+!----------------------------------------
+#define I_VI_I(NAME, VKIND) __ppc_##NAME##_i##VKIND##vi##VKIND##i0
+#define R_VR_I(NAME, VKIND) __ppc_##NAME##_r##VKIND##vr##VKIND##i0
+
+#define VEC_I_VI_I(NAME, VKIND) \
+  procedure(elem_func_i##VKIND##vi##VKIND##i) :: I_VI_I(NAME, VKIND);
+#define VEC_R_VR_I(NAME, VKIND) \
+  procedure(elem_func_r##VKIND##vr##VKIND##i) :: R_VR_I(NAME, VKIND);
+
+! vec_extract
+  VEC_I_VI_I(vec_extract,1) VEC_I_VI_I(vec_extract,2) VEC_I_VI_I(vec_extract,4) VEC_I_VI_I(vec_extract,8)
+  VEC_R_VR_I(vec_extract,4) VEC_R_VR_I(vec_extract,8)
+  interface vec_extract
+     procedure :: I_VI_I(vec_extract,1), I_VI_I(vec_extract,2), I_VI_I(vec_extract,4), I_VI_I(vec_extract,8)
+     procedure :: R_VR_I(vec_extract,4), R_VR_I(vec_extract,8)
+  end interface
+  public :: vec_extract
+
 !------------------------------------------
 ! vector function(vector, vector, integer)
 !------------------------------------------

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90
new file mode 100644
index 00000000000000..2288999fd9589a
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90
@@ -0,0 +1,53 @@
+! RUN: bbc -emit-fir -fno-ppc-native-vector-element-order %s -o - | FileCheck --check-prefixes="FIR" %s
+! RUN: %flang_fc1 -emit-llvm -fno-ppc-native-vector-element-order %s -o - | FileCheck --check-prefixes="LLVMIR" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!CHECK-LABEL: vec_extract_testr4i8
+subroutine vec_extract_testr4i8(arg1, arg2, r)
+  vector(real(4)) :: arg1
+  real(4) :: r
+  integer(8) :: arg2
+  r = vec_extract(arg1, arg2)
+
+! FIR: %[[arg1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! FIR: %[[arg2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! FIR: %[[varg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! FIR: %[[c:.*]] = arith.constant 4 : i64
+! FIR: %[[urem:.*]] = llvm.urem %[[arg2]], %[[c]] : i64
+! FIR: %[[c2:.*]] = arith.constant 3 : i64
+! FIR: %[[sub:.*]] = llvm.sub %[[c2]], %[[urem]] : i64
+! FIR: %[[ext:.*]] = vector.extractelement %[[varg1]][%[[sub]] : i64] : vector<4xf32>
+! FIR: fir.store %[[ext]] to %arg2 : !fir.ref<f32>
+
+! LLVMIR: %[[arg1:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! LLVMIR: %[[arg2:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! LLVMIR: %[[urem:.*]] = urem i64 %[[arg2]], 4
+! LLVMIR: %[[sub:.*]] = sub i64 3, %[[urem]]
+! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[arg1]], i64 %[[sub]]
+! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4
+end subroutine vec_extract_testr4i8
+
+!CHECK-LABEL: vec_extract_testi8i1
+subroutine vec_extract_testi8i1(arg1, arg2, r)
+  vector(integer(8)) :: arg1
+  integer(8) :: r
+  integer(1) :: arg2
+  r = vec_extract(arg1, arg2)
+
+! FIR: %[[arg1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! FIR: %[[arg2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! FIR: %[[varg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! FIR: %[[c:.*]] = arith.constant 2 : i8
+! FIR: %[[urem:.*]] = llvm.urem %[[arg2]], %[[c]]  : i8
+! FIR: %[[c2:.*]] = arith.constant 1 : i8
+! FIR: %[[sub:.*]] = llvm.sub %[[c2]], %[[urem]] : i8
+! FIR: %[[ext:.*]] = vector.extractelement %[[varg1]][%[[sub]] : i8] : vector<2xi64>
+! FIR: fir.store %[[ext]] to %arg2 : !fir.ref<i64>
+
+! LLVMIR: %[[arg1:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! LLVMIR: %[[arg2:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! LLVMIR: %[[urem:.*]] = urem i8 %[[arg2]], 2
+! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]]
+! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[arg1]], i8 %[[sub]]
+! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8
+end subroutine vec_extract_testi8i1

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-extract.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract.f90
new file mode 100644
index 00000000000000..78405d5a535f73
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-extract.f90
@@ -0,0 +1,589 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck --check-prefixes="CHECK-FIR" %s
+! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --fir-to-llvm-ir | FileCheck --check-prefixes="CHECK-LLVMIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!-------------
+! vec_extract
+!-------------
+! CHECK-LABEL: vec_extract_testf32
+subroutine vec_extract_testf32(x, i1, i2, i4, i8)
+  vector(real(4)) :: x
+  real(4) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<4xf32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f32>
+
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x float> %[[x]], i8 %[[u]]
+! CHECK: store float %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<4xf32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f32>
+
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x float> %[[x]], i16 %[[u]]
+! CHECK: store float %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<4xf32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f32>
+
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x float> %[[x]], i32 %[[u]]
+! CHECK: store float %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<4xf32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f32>
+
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[u]]
+! CHECK: store float %[[r]], ptr %{{[0-9]}}, align 4
+end subroutine vec_extract_testf32
+
+! CHECK-LABEL: vec_extract_testf64
+subroutine vec_extract_testf64(x, i1, i2, i4, i8)
+  vector(real(8)) :: x
+  real(8) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<2xf64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f64>
+
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x double> %[[x]], i8 %[[u]]
+! CHECK: store double %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<2xf64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f64>
+
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x double> %[[x]], i16 %[[u]]
+! CHECK: store double %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<2xf64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f64>
+
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x double> %[[x]], i32 %[[u]]
+! CHECK: store double %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<2xf64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<f64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<f64>
+
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[u]]
+! CHECK: store double %[[r]], ptr %{{[0-9]}}, align 8
+end subroutine vec_extract_testf64
+
+! CHECK-LABEL: vec_extract_testi8
+subroutine vec_extract_testi8(x, i1, i2, i4, i8)
+  vector(integer(1)) :: x
+  integer(1) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<16xi8>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i8>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i8>
+
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 16
+! CHECK: %[[r:.*]] = extractelement <16 x i8> %[[x]], i8 %[[u]]
+! CHECK: store i8 %[[r]], ptr %{{[0-9]}}, align 1
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<16xi8>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i8>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i8>
+
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 16
+! CHECK: %[[r:.*]] = extractelement <16 x i8> %[[x]], i16 %[[u]]
+! CHECK: store i8 %[[r]], ptr %{{[0-9]}}, align 1
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<16xi8>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i8>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i8>
+
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 16
+! CHECK: %[[r:.*]] = extractelement <16 x i8> %[[x]], i32 %[[u]]
+! CHECK: store i8 %[[r]], ptr %{{[0-9]}}, align 1
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<16xi8>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i8>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i8>
+
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 16
+! CHECK: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[u]]
+! CHECK: store i8 %[[r]], ptr %{{[0-9]}}, align 1
+end subroutine vec_extract_testi8
+
+! CHECK-LABEL: vec_extract_testi16
+subroutine vec_extract_testi16(x, i1, i2, i4, i8)
+  vector(integer(2)) :: x
+  integer(2) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<8xi16>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i16>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i16>
+
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 8
+! CHECK: %[[r:.*]] = extractelement <8 x i16> %[[x]], i8 %[[u]]
+! CHECK: store i16 %[[r]], ptr %{{[0-9]}}, align 2
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<8xi16>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i16>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i16>
+
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 8
+! CHECK: %[[r:.*]] = extractelement <8 x i16> %[[x]], i16 %[[u]]
+! CHECK: store i16 %[[r]], ptr %{{[0-9]}}, align 2
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<8xi16>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i16>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i16>
+
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 8
+! CHECK: %[[r:.*]] = extractelement <8 x i16> %[[x]], i32 %[[u]]
+! CHECK: store i16 %[[r]], ptr %{{[0-9]}}, align 2
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<8xi16>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i16>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i16>
+
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 8
+! CHECK: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[u]]
+! CHECK: store i16 %[[r]], ptr %{{[0-9]}}, align 2
+end subroutine vec_extract_testi16
+
+! CHECK-LABEL: vec_extract_testi32
+subroutine vec_extract_testi32(x, i1, i2, i4, i8)
+  vector(integer(4)) :: x
+  integer(4) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<4xi32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i32>
+
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x i32> %[[x]], i8 %[[u]]
+! CHECK: store i32 %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<4xi32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i32>
+
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x i32> %[[x]], i16 %[[u]]
+! CHECK: store i32 %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<4xi32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i32>
+
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x i32> %[[x]], i32 %[[u]]
+! CHECK: store i32 %[[r]], ptr %{{[0-9]}}, align 4
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<4xi32>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i32>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i32>
+
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 4
+! CHECK: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[u]]
+! CHECK: store i32 %[[r]], ptr %{{[0-9]}}, align 4
+end subroutine vec_extract_testi32
+
+! CHECK-LABEL: vec_extract_testi64
+subroutine vec_extract_testi64(x, i1, i2, i4, i8)
+  vector(integer(8)) :: x
+  integer(8) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_extract(x, i1)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i8
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i8] : vector<2xi64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i8) : i8
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i1]], %[[c]]  : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i8] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i64>
+
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[u:.*]] = urem i8 %[[i1]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x i64> %[[x]], i8 %[[u]]
+! CHECK: store i64 %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i2)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i16
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i16] : vector<2xi64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i16) : i16
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i2]], %[[c]]  : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i16] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i64>
+
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[u:.*]] = urem i16 %[[i2]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x i64> %[[x]], i16 %[[u]]
+! CHECK: store i64 %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i4)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i32
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i32] : vector<2xi64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i32) : i32
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i4]], %[[c]]  : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i32] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i64>
+
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[u:.*]] = urem i32 %[[i4]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x i64> %[[x]], i32 %[[u]]
+! CHECK: store i64 %[[r]], ptr %{{[0-9]}}, align 8
+
+  r = vec_extract(x, i8)
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i64
+! CHECK-FIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-FIR: %[[r:.*]] = vector.extractelement %[[vr]][%[[u]] : i64] : vector<2xi64>
+! CHECK-FIR: fir.store %[[r]] to %{{[0-9]}} : !fir.ref<i64>
+
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i64) : i64
+! CHECK-LLVMIR: %[[u:.*]] = llvm.urem %[[i8]], %[[c]]  : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.extractelement %[[x]][%[[u]] : i64] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<i64>
+
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[u:.*]] = urem i64 %[[i8]], 2
+! CHECK: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[u]]
+! CHECK: store i64 %[[r]], ptr %{{[0-9]}}, align 8
+end subroutine vec_extract_testi64

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90
new file mode 100644
index 00000000000000..d6d8ae503d3254
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90
@@ -0,0 +1,61 @@
+! RUN: bbc -emit-fir -fno-ppc-native-vector-element-order %s -o - | FileCheck --check-prefixes="FIR" %s
+! RUN: %flang_fc1 -emit-llvm -fno-ppc-native-vector-element-order %s -o - | FileCheck --check-prefixes="LLVMIR" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!CHECK-LABEL: vec_insert_testf32i64
+subroutine vec_insert_testf32i64(v, x, i8)
+  real(4) :: v
+  vector(real(4)) :: x
+  vector(real(4)) :: r
+  integer(8) :: i8
+  r = vec_insert(v, x, i8)
+
+! FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f32>
+! FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! FIR: %[[c:.*]] = arith.constant 4 : i64
+! FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! FIR: %[[c3:.*]] = arith.constant 3 : i64
+! FIR: %[[sub:.*]] = llvm.sub %[[c3]], %[[urem]] : i64
+! FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[sub]] : i64] : vector<4xf32>
+! FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+
+! LLVMIR: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4
+! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 4
+! LLVMIR: %[[sub:.*]] = sub i64 3, %[[urem]]
+! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[sub]]
+! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testf32i64
+
+!CHECK-LABEL: vec_insert_testi64i8
+subroutine vec_insert_testi64i8(v, x, i1, i2, i4, i8)
+  integer(8) :: v
+  vector(integer(8)) :: x
+  vector(integer(8)) :: r
+  integer(1) :: i1
+  r = vec_insert(v, x, i1)
+
+! FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! FIR: %[[c:.*]] = arith.constant 2 : i8
+! FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! FIR: %[[c1:.*]] = arith.constant 1 : i8
+! FIR: %[[sub:.*]] = llvm.sub %[[c1]], %[[urem]] : i8
+! FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[sub]] : i8] : vector<2xi64>
+! FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+
+! LLVMIR: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2
+! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]]
+! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[sub]]
+! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testi64i8

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-insert.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert.f90
new file mode 100644
index 00000000000000..dac186771e67a1
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-insert.f90
@@ -0,0 +1,697 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck --check-prefixes="CHECK-FIR" %s
+! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --fir-to-llvm-ir | FileCheck --check-prefixes="CHECK-LLVMIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+! vec_insert
+
+!CHECK-LABEL: vec_insert_testf32
+subroutine vec_insert_testf32(v, x, i1, i2, i4, i8)
+  real(4) :: v
+  vector(real(4)) :: x
+  vector(real(4)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<4xf32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i8 %[[urem]]
+! CHECK: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<4xf32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i16 %[[urem]]
+! CHECK: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<4xf32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i32 %[[urem]]
+! CHECK: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<4xf32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[urem]]
+! CHECK: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testf32
+
+!CHECK-LABEL: vec_insert_testf64
+subroutine vec_insert_testf64(v, x, i1, i2, i4, i8)
+  real(8) :: v
+  vector(real(8)) :: x
+  vector(real(8)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<2xf64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i8 %[[urem]]
+! CHECK: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<2xf64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i16 %[[urem]]
+! CHECK: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<2xf64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i32 %[[urem]]
+! CHECK: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<f64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<2xf64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<f64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[urem]]
+! CHECK: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testf64
+
+!CHECK-LABEL: vec_insert_testi8
+subroutine vec_insert_testi8(v, x, i1, i2, i4, i8)
+  integer(1) :: v
+  vector(integer(1)) :: x
+  vector(integer(1)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<16xi8>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 16
+! CHECK: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i8 %[[urem]]
+! CHECK: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<16xi8>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 16
+! CHECK: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i16 %[[urem]]
+! CHECK: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<16xi8>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 16
+! CHECK: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i32 %[[urem]]
+! CHECK: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[c:.*]] = arith.constant 16 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<16xi8>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(16 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 16
+! CHECK: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[urem]]
+! CHECK: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testi8
+
+!CHECK-LABEL: vec_insert_testi16
+subroutine vec_insert_testi16(v, x, i1, i2, i4, i8)
+  integer(2) :: v
+  vector(integer(2)) :: x
+  vector(integer(2)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<8xi16>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 8
+! CHECK: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i8 %[[urem]]
+! CHECK: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<8xi16>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 8
+! CHECK: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i16 %[[urem]]
+! CHECK: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<8xi16>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 8
+! CHECK: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i32 %[[urem]]
+! CHECK: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[c:.*]] = arith.constant 8 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<8xi16>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(8 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 8
+! CHECK: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[urem]]
+! CHECK: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testi16
+
+!CHECK-LABEL: vec_insert_testi32
+subroutine vec_insert_testi32(v, x, i1, i2, i4, i8)
+  integer(4) :: v
+  vector(integer(4)) :: x
+  vector(integer(4)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<4xi32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i8 %[[urem]]
+! CHECK: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<4xi32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i16 %[[urem]]
+! CHECK: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<4xi32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i32 %[[urem]]
+! CHECK: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[c:.*]] = arith.constant 4 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<4xi32>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(4 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 4
+! CHECK: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[urem]]
+! CHECK: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testi32
+
+
+!CHECK-LABEL: vec_insert_testi64
+subroutine vec_insert_testi64(v, x, i1, i2, i4, i8)
+  integer(8) :: v
+  vector(integer(8)) :: x
+  vector(integer(8)) :: r
+  integer(1) :: i1
+  integer(2) :: i2
+  integer(4) :: i4
+  integer(8) :: i8
+  r = vec_insert(v, x, i1)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i1:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i8>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i8
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i8] : vector<2xi64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i1:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i8>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i8) : i8
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i1]], %[[c]] : i8
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i8] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1
+! CHECK: %[[urem:.*]] = urem i8 %[[i1]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[urem]]
+! CHECK: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16
+
+
+  r = vec_insert(v, x, i2)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i2:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i16>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i16
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i16] : vector<2xi64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i2:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i16>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i16) : i16
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i2]], %[[c]] : i16
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i16] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2
+! CHECK: %[[urem:.*]] = urem i16 %[[i2]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i16 %[[urem]]
+! CHECK: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i4)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i4:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i32>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i32
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i32] : vector<2xi64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i4:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i32>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i32) : i32
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i4]], %[[c]] : i32
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i32] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4
+! CHECK: %[[urem:.*]] = urem i32 %[[i4]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i32 %[[urem]]
+! CHECK: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16
+
+  r = vec_insert(v, x, i8)
+! CHECK-FIR: %[[v:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[x:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[i8:.*]] = fir.load %arg{{[0-9]}} : !fir.ref<i64>
+! CHECK-FIR: %[[vr:.*]] = fir.convert %[[x]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[c:.*]] = arith.constant 2 : i64
+! CHECK-FIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-FIR: %[[r:.*]] = vector.insertelement %[[v]], %[[vr]][%[[urem]] : i64] : vector<2xi64>
+! CHECK-FIR: %[[r_conv:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[r_conv]] to %{{[0-9]}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[v:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[x:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[i8:.*]] = llvm.load %arg{{[0-9]}} : !llvm.ptr<i64>
+! CHECK-LLVMIR: %[[c:.*]] = llvm.mlir.constant(2 : i64) : i64
+! CHECK-LLVMIR: %[[urem:.*]] = llvm.urem %[[i8]], %[[c]] : i64
+! CHECK-LLVMIR: %[[r:.*]] = llvm.insertelement %[[v]], %[[x]][%[[urem]] : i64] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{[0-9]}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16
+! CHECK: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8
+! CHECK: %[[urem:.*]] = urem i64 %[[i8]], 2
+! CHECK: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[urem]]
+! CHECK: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16
+end subroutine vec_insert_testi64

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-merge-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-merge-elem-order.f90
new file mode 100644
index 00000000000000..41b2c2bbddfa66
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-merge-elem-order.f90
@@ -0,0 +1,49 @@
+! RUN: bbc -emit-fir %s -fno-ppc-native-vector-element-order -o - | FileCheck --check-prefixes="FIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -o - | FileCheck --check-prefixes="LLVMIR" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!-----------------
+! vec_mergeh
+!-----------------
+
+! FIR-LABEL: vec_mergeh_test_i4
+subroutine vec_mergeh_test_i4(arg1, arg2)
+  vector(integer(4)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [6, 2, 7, 3] : vector<4xi32>, vector<4xi32>
+! FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+
+! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! LLVMIR: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! LLVMIR: %[[r:.*]] = shufflevector <4 x i32> %[[arg1]], <4 x i32> %[[arg2]], <4 x i32> <i32 6, i32 2, i32 7, i32 3>
+! LLVMIR: store <4 x i32> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_i4
+
+!-----------------
+! vec_mergel
+!-----------------
+
+! FIR-LABEL: vec_mergel_test_r8
+subroutine vec_mergel_test_r8(arg1, arg2)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [2, 0] : vector<2xf64>, vector<2xf64>
+! FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! LLVMIR: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! LLVMIR: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! LLVMIR: %[[r:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 2, i32 0>
+! LLVMIR: store <2 x double> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_r8

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-merge.f90 b/flang/test/Lower/PowerPC/ppc-vec-merge.f90
new file mode 100644
index 00000000000000..1eaf1750b2252e
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-merge.f90
@@ -0,0 +1,492 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck --check-prefixes="CHECK-FIR" %s
+! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --fir-to-llvm-ir | FileCheck --check-prefixes="CHECK-LLVMIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!------------
+! vec_mergeh
+!------------
+
+! CHECK-LABEL: vec_mergeh_test_i1
+subroutine vec_mergeh_test_i1(arg1, arg2)
+  vector(integer(1)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] : vector<16xi8>, vector<16xi8>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <16 x i8> %[[arg1]], <16 x i8> %[[arg2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+! CHECK: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_i1
+
+! CHECK-LABEL: vec_mergeh_test_i2
+subroutine vec_mergeh_test_i2(arg1, arg2)
+  vector(integer(2)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 8, 1, 9, 2, 10, 3, 11] : vector<8xi16>, vector<8xi16>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 8, 1, 9, 2, 10, 3, 11] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <8 x i16> %[[arg1]], <8 x i16> %[[arg2]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+! CHECK: store <8 x i16> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_i2
+
+! CHECK-LABEL: vec_mergeh_test_i4
+subroutine vec_mergeh_test_i4(arg1, arg2)
+  vector(integer(4)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 4, 1, 5] : vector<4xi32>, vector<4xi32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 4, 1, 5] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x i32> %[[arg1]], <4 x i32> %[[arg2]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+! CHECK: store <4 x i32> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_i4
+
+! CHECK-LABEL: vec_mergeh_test_i8
+subroutine vec_mergeh_test_i8(arg1, arg2)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x i64> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_i8
+
+! CHECK-LABEL: vec_mergeh_test_u1
+subroutine vec_mergeh_test_u1(arg1, arg2)
+  vector(unsigned(1)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] : vector<16xi8>, vector<16xi8>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:ui8>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <16 x i8> %[[arg1]], <16 x i8> %[[arg2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+! CHECK: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_u1
+
+! CHECK-LABEL: vec_mergeh_test_u2
+subroutine vec_mergeh_test_u2(arg1, arg2)
+  vector(unsigned(2)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 8, 1, 9, 2, 10, 3, 11] : vector<8xi16>, vector<8xi16>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:ui16>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 8, 1, 9, 2, 10, 3, 11] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <8 x i16> %[[arg1]], <8 x i16> %[[arg2]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+! CHECK: store <8 x i16> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_u2
+
+! CHECK-LABEL: vec_mergeh_test_u4
+subroutine vec_mergeh_test_u4(arg1, arg2)
+  vector(unsigned(4)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 4, 1, 5] : vector<4xi32>, vector<4xi32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:ui32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 4, 1, 5] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x i32> %[[arg1]], <4 x i32> %[[arg2]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+! CHECK: store <4 x i32> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_u4
+
+! CHECK-LABEL: vec_mergeh_test_u8
+subroutine vec_mergeh_test_u8(arg1, arg2)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x i64> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_u8
+
+! CHECK-LABEL: vec_mergeh_test_r4
+subroutine vec_mergeh_test_r4(arg1, arg2)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 4, 1, 5] : vector<4xf32>, vector<4xf32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 4, 1, 5] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x float> %[[arg1]], <4 x float> %[[arg2]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+! CHECK: store <4 x float> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_r4
+
+! CHECK-LABEL: vec_mergeh_test_r8
+subroutine vec_mergeh_test_r8(arg1, arg2)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_mergeh(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 2] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x double> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergeh_test_r8
+
+!------------
+! vec_mergel
+!------------
+
+! CHECK-LABEL: vec_mergel_test_i1
+subroutine vec_mergel_test_i1(arg1, arg2)
+  vector(integer(1)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xi8>, vector<16xi8>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <16 x i8> %[[arg1]], <16 x i8> %[[arg2]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+! CHECK: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_i1
+
+! CHECK-LABEL: vec_mergel_test_i2
+subroutine vec_mergel_test_i2(arg1, arg2)
+  vector(integer(2)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [4, 12, 5, 13, 6, 14, 7, 15] : vector<8xi16>, vector<8xi16>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [4, 12, 5, 13, 6, 14, 7, 15] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <8 x i16> %[[arg1]], <8 x i16> %[[arg2]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+! CHECK: store <8 x i16> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_i2
+
+! CHECK-LABEL: vec_mergel_test_i4
+subroutine vec_mergel_test_i4(arg1, arg2)
+  vector(integer(4)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [2, 6, 3, 7] : vector<4xi32>, vector<4xi32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [2, 6, 3, 7] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x i32> %[[arg1]], <4 x i32> %[[arg2]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+! CHECK: store <4 x i32> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_i4
+
+! CHECK-LABEL: vec_mergel_test_i8
+subroutine vec_mergel_test_i8(arg1, arg2)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x i64> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_i8
+
+! CHECK-LABEL: vec_mergel_test_u1
+subroutine vec_mergel_test_u1(arg1, arg2)
+  vector(unsigned(1)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xi8>, vector<16xi8>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<16xi8>) -> !fir.vector<16:ui8>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] : vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <16 x i8> %[[arg1]], <16 x i8> %[[arg2]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+! CHECK: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_u1
+
+! CHECK-LABEL: vec_mergel_test_u2
+subroutine vec_mergel_test_u2(arg1, arg2)
+  vector(unsigned(2)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [4, 12, 5, 13, 6, 14, 7, 15] : vector<8xi16>, vector<8xi16>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<8xi16>) -> !fir.vector<8:ui16>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [4, 12, 5, 13, 6, 14, 7, 15] : vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <8 x i16> %[[arg1]], <8 x i16> %[[arg2]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+! CHECK: store <8 x i16> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_u2
+
+! CHECK-LABEL: vec_mergel_test_u4
+subroutine vec_mergel_test_u4(arg1, arg2)
+  vector(unsigned(4)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [2, 6, 3, 7] : vector<4xi32>, vector<4xi32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xi32>) -> !fir.vector<4:ui32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [2, 6, 3, 7] : vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x i32> %[[arg1]], <4 x i32> %[[arg2]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+! CHECK: store <4 x i32> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_u4
+
+! CHECK-LABEL: vec_mergel_test_u8
+subroutine vec_mergel_test_u8(arg1, arg2)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x i64> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_u8
+
+! CHECK-LABEL: vec_mergel_test_r4
+subroutine vec_mergel_test_r4(arg1, arg2)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [2, 6, 3, 7] : vector<4xf32>, vector<4xf32>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [2, 6, 3, 7] : vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <4 x float> %[[arg1]], <4 x float> %[[arg2]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+! CHECK: store <4 x float> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_r4
+
+! CHECK-LABEL: vec_mergel_test_r8
+subroutine vec_mergel_test_r8(arg1, arg2)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_mergel(arg1, arg2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[r:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 3] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cr:.*]] = fir.convert %[[r]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cr]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[r:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[r]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[r:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x double> %[[r]], ptr %{{.*}}, align 16
+end subroutine vec_mergel_test_r8

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-perm-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-perm-elem-order.f90
new file mode 100644
index 00000000000000..b0f24ee1897a52
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-perm-elem-order.f90
@@ -0,0 +1,60 @@
+! RUN: bbc -emit-fir %s -fno-ppc-native-vector-element-order -o - | FileCheck --check-prefixes="FIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -o - | FileCheck --check-prefixes="LLVMIR" %s
+! REQUIRES: target=powerpc{{.*}}
+
+!----------------
+! vec_perm
+!----------------
+
+! CHECK-LABEL: vec_perm_test_i1
+subroutine vec_perm_test_i1(arg1, arg2, arg3)
+  vector(integer(1)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<16xi8> to vector<4xi32>
+! FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<16xi8> to vector<4xi32>
+! FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg1]], %[[barg2]], %[[carg3]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! FIR: %[[vcall:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! FIR: %[[bcall:.*]] = llvm.bitcast %[[vcall]] : vector<4xi32> to vector<16xi8>
+! FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+
+! LLVMIR: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! LLVMIR: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! LLVMIR: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! LLVMIR: %[[barg1:.*]] = bitcast <16 x i8> %[[arg1]] to <4 x i32>
+! LLVMIR: %[[barg2:.*]] = bitcast <16 x i8> %[[arg2]] to <4 x i32>
+! LLVMIR: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg1]], <4 x i32> %[[barg2]], <16 x i8> %[[arg3]])
+! LLVMIR: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <16 x i8>
+! LLVMIR: store <16 x i8> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_i1
+
+!----------------
+! vec_permi
+!----------------
+
+! CHECK-LABEL: vec_permi_test_i8i2
+subroutine vec_permi_test_i8i2(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 2_2)
+
+! FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [3, 0] : vector<2xi64>, vector<2xi64>
+! FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! LLVMIR: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! LLVMIR: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! LLVMIR: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 3, i32 0>
+! LLVMIR: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_i8i2

diff  --git a/flang/test/Lower/PowerPC/ppc-vec-perm.f90 b/flang/test/Lower/PowerPC/ppc-vec-perm.f90
new file mode 100644
index 00000000000000..a799b252b3a45a
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-vec-perm.f90
@@ -0,0 +1,872 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck --check-prefixes="CHECK-FIR" %s
+! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --fir-to-llvm-ir | FileCheck --check-prefixes="CHECK-LLVMIR" %s
+! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+! CHECK-LABEL: vec_perm_test_i1
+subroutine vec_perm_test_i1(arg1, arg2, arg3)
+  vector(integer(1)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:i8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<16xi8> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<16xi8> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<16xi8>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<16xi8>) -> !fir.vector<16:i8>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<16:i8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<16xi8> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<16xi8> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <16 x i8> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <16 x i8> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <16 x i8>
+! CHECK: store <16 x i8> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_i1
+
+! CHECK-LABEL: vec_perm_test_i2
+subroutine vec_perm_test_i2(arg1, arg2, arg3)
+  vector(integer(2)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:i16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<8xi16> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<8xi16> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<8xi16>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<8xi16>) -> !fir.vector<8:i16>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<8:i16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<8xi16> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<8xi16> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <8 x i16> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <8 x i16> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <8 x i16>
+! CHECK: store <8 x i16> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_i2
+
+! CHECK-LABEL: vec_perm_test_i4
+subroutine vec_perm_test_i4(arg1, arg2, arg3)
+  vector(integer(4)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[carg2]], %[[carg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: fir.store %[[call]] to %{{.*}} : !fir.ref<!fir.vector<4:i32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[arg2]], %[[arg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[call]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[arg2]], <4 x i32> %[[arg1]], <16 x i8> %[[xor]])
+! CHECK: store <4 x i32> %[[call]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_i4
+
+! CHECK-LABEL: vec_perm_test_i8
+subroutine vec_perm_test_i8(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xi64> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xi64> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<2xi64>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<2xi64> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<2xi64> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <2 x i64> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <2 x i64> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <2 x i64>
+! CHECK: store <2 x i64> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_i8
+
+! CHECK-LABEL: vec_perm_test_u1
+subroutine vec_perm_test_u1(arg1, arg2, arg3)
+  vector(unsigned(1)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<16xi8> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<16xi8> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<16xi8>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<16xi8>) -> !fir.vector<16:ui8>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<16xi8> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<16xi8> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<16xi8>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<16xi8>>
+
+! CHECK: %[[arg1:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <16 x i8> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <16 x i8> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <16 x i8>
+! CHECK: store <16 x i8> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_u1
+
+! CHECK-LABEL: vec_perm_test_u2
+! vec_perm on vector(unsigned(2)): data operands are bitcast to <4 x i32>, the
+! control vector is complemented (xor with all-ones) and the data operands are
+! passed in reversed order to @llvm.ppc.altivec.vperm; result is bitcast back.
+subroutine vec_perm_test_u2(arg1, arg2, arg3)
+  vector(unsigned(2)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<8:ui16>) -> vector<8xi16>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<8xi16> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<8xi16> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<8xi16>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<8xi16>) -> !fir.vector<8:ui16>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<8:ui16>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<8xi16>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<8xi16> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<8xi16> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<8xi16>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<8xi16>>
+
+! CHECK: %[[arg1:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <8 x i16>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <8 x i16> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <8 x i16> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <8 x i16>
+! CHECK: store <8 x i16> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_u2
+
+! CHECK-LABEL: vec_perm_test_u4
+! vec_perm on vector(unsigned(4)): no data bitcasts are needed since the
+! operands are already <4 x i32>; only the complemented control vector and the
+! reversed operand order to @llvm.ppc.altivec.vperm are checked.
+subroutine vec_perm_test_u4(arg1, arg2, arg3)
+  vector(unsigned(4)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:ui32>) -> vector<4xi32>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[carg2]], %[[carg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[call2]] : (vector<4xi32>) -> !fir.vector<4:ui32>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<4:ui32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xi32>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[arg2]], %[[arg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: llvm.store %[[call]], %{{.*}} : !llvm.ptr<vector<4xi32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x i32>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[arg2]], <4 x i32> %[[arg1]], <16 x i8> %[[xor]])
+! CHECK: store <4 x i32> %[[call]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_u4
+
+! CHECK-LABEL: vec_perm_test_u8
+! vec_perm on vector(unsigned(8)): <2 x i64> operands are bitcast to <4 x i32>,
+! the control vector is complemented, and @llvm.ppc.altivec.vperm is called with
+! the data operands reversed; the <4 x i32> result is bitcast back to <2 x i64>.
+subroutine vec_perm_test_u8(arg1, arg2, arg3)
+  vector(unsigned(8)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xi64> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xi64> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<2xi64>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<2xi64> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<2xi64> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <2 x i64> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <2 x i64> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <2 x i64>
+! CHECK: store <2 x i64> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_u8
+
+! CHECK-LABEL: vec_perm_test_r4
+! vec_perm on vector(real(4)): <4 x float> operands are bitcast to <4 x i32>,
+! the control vector is complemented, operands are passed reversed to
+! @llvm.ppc.altivec.vperm, and the result is bitcast back to <4 x float>.
+subroutine vec_perm_test_r4(arg1, arg2, arg3)
+  vector(real(4)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<4xf32> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<4xf32> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<4xf32>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<4xf32> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<4xf32> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <4 x float> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <4 x float> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <4 x float>
+! CHECK: store <4 x float> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_r4
+
+! CHECK-LABEL: vec_perm_test_r8
+! vec_perm on vector(real(8)): <2 x double> operands are bitcast to <4 x i32>,
+! the control vector is complemented, operands are passed reversed to
+! @llvm.ppc.altivec.vperm, and the result is bitcast back to <2 x double>.
+subroutine vec_perm_test_r8(arg1, arg2, arg3)
+  vector(real(8)) :: arg1, arg2, r
+  vector(unsigned(1)) :: arg3
+  r = vec_perm(arg1, arg2, arg3)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg3:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<16:ui8>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg3:.*]] = fir.convert %[[arg3]] : (!fir.vector<16:ui8>) -> vector<16xi8>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xf64> to vector<4xi32>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xf64> to vector<4xi32>
+! CHECK-FIR: %[[const:.*]] = arith.constant -1 : i8
+! CHECK-FIR: %[[vconst:.*]] = vector.broadcast %[[const]] : i8 to vector<16xi8>
+! CHECK-FIR: %[[xor:.*]] = arith.xori %[[carg3]], %[[vconst]] : vector<16xi8>
+! CHECK-FIR: %[[call:.*]] = fir.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) fastmath<contract> : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> !fir.vector<4:i32>
+! CHECK-FIR: %[[call2:.*]] = fir.convert %[[call]] : (!fir.vector<4:i32>) -> vector<4xi32>
+! CHECK-FIR: %[[bcall:.*]] = llvm.bitcast %[[call2]] : vector<4xi32> to vector<2xf64>
+! CHECK-FIR: %[[ccall:.*]] = fir.convert %[[bcall]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[ccall]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg3:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<16xi8>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<2xf64> to vector<4xi32>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<2xf64> to vector<4xi32>
+! CHECK-LLVMIR: %[[const:.*]] = llvm.mlir.constant(-1 : i8) : i8
+! CHECK-LLVMIR: %[[vconst:.*]] = llvm.mlir.constant(dense<-1> : vector<16xi8>) : vector<16xi8>
+! CHECK-LLVMIR: %[[xor:.*]] = llvm.xor %[[arg3]], %[[vconst]]  : vector<16xi8>
+! CHECK-LLVMIR: %[[call:.*]] = llvm.call @llvm.ppc.altivec.vperm(%[[barg2]], %[[barg1]], %[[xor]]) {fastmathFlags = #llvm.fastmath<contract>} : (vector<4xi32>, vector<4xi32>, vector<16xi8>) -> vector<4xi32>
+! CHECK-LLVMIR: %[[bcall:.*]] = llvm.bitcast %[[call]] : vector<4xi32> to vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[bcall]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg3:.*]] = load <16 x i8>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <2 x double> %[[arg1]] to <4 x i32>
+! CHECK: %[[barg2:.*]] = bitcast <2 x double> %[[arg2]] to <4 x i32>
+! CHECK: %[[xor:.*]] = xor <16 x i8> %[[arg3]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+! CHECK: %[[call:.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> %[[barg2]], <4 x i32> %[[barg1]], <16 x i8> %[[xor]])
+! CHECK: %[[bcall:.*]] = bitcast <4 x i32> %[[call]] to <2 x double>
+! CHECK: store <2 x double> %[[bcall]], ptr %{{.*}}, align 16
+end subroutine vec_perm_test_r8
+
+! CHECK-LABEL: vec_permi_test_i8i1
+! vec_permi on vector(integer(8)) with INTEGER(1) mode constant 3: lowered to a
+! shufflevector with mask [1, 3] (no intrinsic call).
+subroutine vec_permi_test_i8i1(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 3_1)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_i8i1
+
+! CHECK-LABEL: vec_permi_test_i8i2
+! vec_permi on vector(integer(8)) with INTEGER(2) mode constant 2: lowered to a
+! shufflevector with mask [1, 2].
+subroutine vec_permi_test_i8i2(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 2_2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 2>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_i8i2
+
+! CHECK-LABEL: vec_permi_test_i8i4
+! vec_permi on vector(integer(8)) with INTEGER(4) mode constant 1: lowered to a
+! shufflevector with mask [0, 3].
+subroutine vec_permi_test_i8i4(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 1_4)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 3>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_i8i4
+
+! CHECK-LABEL: vec_permi_test_i8i8
+! vec_permi on vector(integer(8)) with INTEGER(8) mode constant 0: lowered to a
+! shufflevector with mask [0, 2].
+subroutine vec_permi_test_i8i8(arg1, arg2, arg3)
+  vector(integer(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 0_8)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:i64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:i64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:i64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_i8i8
+
+! CHECK-LABEL: vec_permi_test_u8i1
+! vec_permi on vector(unsigned(8)) with mode constant 3: shufflevector [1, 3];
+! same lowering as the integer(8) case modulo the ui64 FIR vector type.
+subroutine vec_permi_test_u8i1(arg1, arg2, arg3)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 3_1)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_u8i1
+
+! CHECK-LABEL: vec_permi_test_u8i2
+! vec_permi on vector(unsigned(8)) with mode constant 2: shufflevector [1, 2].
+subroutine vec_permi_test_u8i2(arg1, arg2, arg3)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 2_2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [1, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 1, i32 2>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_u8i2
+
+! CHECK-LABEL: vec_permi_test_u8i4
+! vec_permi on vector(unsigned(8)) with mode constant 1: shufflevector [0, 3].
+subroutine vec_permi_test_u8i4(arg1, arg2, arg3)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 1_4)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 3] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 3] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 3>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_u8i4
+
+! CHECK-LABEL: vec_permi_test_u8i8
+! vec_permi on vector(unsigned(8)) with mode constant 0: shufflevector [0, 2].
+subroutine vec_permi_test_u8i8(arg1, arg2, arg3)
+  vector(unsigned(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 0_8)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:ui64>) -> vector<2xi64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[carg1]], %[[carg2]] [0, 2] : vector<2xi64>, vector<2xi64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xi64>) -> !fir.vector<2:ui64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:ui64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %arg0 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %arg1 : !llvm.ptr<vector<2xi64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xi64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xi64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x i64> %[[arg1]], <2 x i64> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x i64> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_u8i8
+
+! CHECK-LABEL: vec_permi_test_r4i1
+! vec_permi on vector(real(4)) with mode constant 3: <4 x float> operands are
+! bitcast to <2 x double>, shuffled with mask [1, 3], then bitcast back.
+subroutine vec_permi_test_r4i1(arg1, arg2, arg3)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 3_1)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [1, 3] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[bshuf]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[barg1]], %[[barg2]] [1, 3] : vector<2xf64>
+! CHECK-LLVMIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[bshuf]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <4 x float> %[[arg1]] to <2 x double>
+! CHECK: %[[barg2:.*]] = bitcast <4 x float> %[[arg2]] to <2 x double>
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[barg1]], <2 x double> %[[barg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: %[[bshuf:.*]] = bitcast <2 x double> %[[shuf]] to <4 x float>
+! CHECK: store <4 x float> %[[bshuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r4i1
+
+! CHECK-LABEL: vec_permi_test_r4i2
+! vec_permi on vector(real(4)) with mode constant 2: <4 x float> operands are
+! bitcast to <2 x double>, shuffled with mask [1, 2], then bitcast back.
+subroutine vec_permi_test_r4i2(arg1, arg2, arg3)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 2_2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [1, 2] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[bshuf]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[barg1]], %[[barg2]] [1, 2] : vector<2xf64>
+! CHECK-LLVMIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[bshuf]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <4 x float> %[[arg1]] to <2 x double>
+! CHECK: %[[barg2:.*]] = bitcast <4 x float> %[[arg2]] to <2 x double>
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[barg1]], <2 x double> %[[barg2]], <2 x i32> <i32 1, i32 2>
+! CHECK: %[[bshuf:.*]] = bitcast <2 x double> %[[shuf]] to <4 x float>
+! CHECK: store <4 x float> %[[bshuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r4i2
+
+! CHECK-LABEL: vec_permi_test_r4i4
+subroutine vec_permi_test_r4i4(arg1, arg2, arg3)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 1_4)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [0, 3] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[bshuf]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[barg1]], %[[barg2]] [0, 3] : vector<2xf64>
+! CHECK-LLVMIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[bshuf]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <4 x float> %[[arg1]] to <2 x double>
+! CHECK: %[[barg2:.*]] = bitcast <4 x float> %[[arg2]] to <2 x double>
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[barg1]], <2 x double> %[[barg2]], <2 x i32> <i32 0, i32 3>
+! CHECK: %[[bshuf:.*]] = bitcast <2 x double> %[[shuf]] to <4 x float>
+! CHECK: store <4 x float> %[[bshuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r4i4
+
+! CHECK-LABEL: vec_permi_test_r4i8
+subroutine vec_permi_test_r4i8(arg1, arg2, arg3)
+  vector(real(4)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 0_8)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<4:f32>) -> vector<4xf32>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [0, 2] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[bshuf]] : (vector<4xf32>) -> !fir.vector<4:f32>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<4:f32>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<4xf32>>
+! CHECK-LLVMIR: %[[barg1:.*]] = llvm.bitcast %[[arg1]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[barg2:.*]] = llvm.bitcast %[[arg2]] : vector<4xf32> to vector<2xf64>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[barg1]], %[[barg2]] [0, 2] : vector<2xf64>
+! CHECK-LLVMIR: %[[bshuf:.*]] = llvm.bitcast %[[shuf]] : vector<2xf64> to vector<4xf32>
+! CHECK-LLVMIR: llvm.store %[[bshuf]], %{{.*}} : !llvm.ptr<vector<4xf32>>
+
+! CHECK: %[[arg1:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <4 x float>, ptr %{{.*}}, align 16
+! CHECK: %[[barg1:.*]] = bitcast <4 x float> %[[arg1]] to <2 x double>
+! CHECK: %[[barg2:.*]] = bitcast <4 x float> %[[arg2]] to <2 x double>
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[barg1]], <2 x double> %[[barg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: %[[bshuf:.*]] = bitcast <2 x double> %[[shuf]] to <4 x float>
+! CHECK: store <4 x float> %[[bshuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r4i8
+
+! CHECK-LABEL: vec_permi_test_r8i1
+subroutine vec_permi_test_r8i1(arg1, arg2, arg3)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 3_1)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [1, 3] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 3] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 1, i32 3>
+! CHECK: store <2 x double> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r8i1
+
+! CHECK-LABEL: vec_permi_test_r8i2
+subroutine vec_permi_test_r8i2(arg1, arg2, arg3)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 2_2)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [1, 2] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [1, 2] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 1, i32 2>
+! CHECK: store <2 x double> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r8i2
+
+! CHECK-LABEL: vec_permi_test_r8i4
+subroutine vec_permi_test_r8i4(arg1, arg2, arg3)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 1_4)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [0, 3] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 3] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 0, i32 3>
+! CHECK: store <2 x double> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r8i4
+
+! CHECK-LABEL: vec_permi_test_r8i8
+subroutine vec_permi_test_r8i8(arg1, arg2, arg3)
+  vector(real(8)) :: arg1, arg2, r
+  r = vec_permi(arg1, arg2, 0_8)
+
+! CHECK-FIR: %[[arg1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[arg2:.*]] = fir.load %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+! CHECK-FIR: %[[carg1:.*]] = fir.convert %[[arg1]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[carg2:.*]] = fir.convert %[[arg2]] : (!fir.vector<2:f64>) -> vector<2xf64>
+! CHECK-FIR: %[[barg1:.*]] = llvm.bitcast %[[carg1]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[barg2:.*]] = llvm.bitcast %[[carg2]] : vector<2xf64> to vector<2xf64>
+! CHECK-FIR: %[[shuf:.*]] = vector.shuffle %[[barg1]], %[[barg2]] [0, 2] : vector<2xf64>, vector<2xf64>
+! CHECK-FIR: %[[cshuf:.*]] = fir.convert %[[shuf]] : (vector<2xf64>) -> !fir.vector<2:f64>
+! CHECK-FIR: fir.store %[[cshuf]] to %{{.*}} : !fir.ref<!fir.vector<2:f64>>
+
+! CHECK-LLVMIR: %[[arg1:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[arg2:.*]] = llvm.load %{{.*}} : !llvm.ptr<vector<2xf64>>
+! CHECK-LLVMIR: %[[shuf:.*]] = llvm.shufflevector %[[arg1]], %[[arg2]] [0, 2] : vector<2xf64>
+! CHECK-LLVMIR: llvm.store %[[shuf]], %{{.*}} : !llvm.ptr<vector<2xf64>>
+
+! CHECK: %[[arg1:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[arg2:.*]] = load <2 x double>, ptr %{{.*}}, align 16
+! CHECK: %[[shuf:.*]] = shufflevector <2 x double> %[[arg1]], <2 x double> %[[arg2]], <2 x i32> <i32 0, i32 2>
+! CHECK: store <2 x double> %[[shuf]], ptr %{{.*}}, align 16
+end subroutine vec_permi_test_r8i8

diff  --git a/flang/test/Semantics/PowerPC/ppc-vector-intrinsics.f90 b/flang/test/Semantics/PowerPC/ppc-vector-intrinsics.f90
index 48866151648c9e..04244687ec374c 100644
--- a/flang/test/Semantics/PowerPC/ppc-vector-intrinsics.f90
+++ b/flang/test/Semantics/PowerPC/ppc-vector-intrinsics.f90
@@ -21,3 +21,12 @@ program test
 ! ERROR: Argument #2 must be a constant expression in range 0-31
   rr = vec_ctf(arg1, 37)
 end program test
+
+subroutine test_vec_permi()
+  vector(integer(8)) :: arg1, arg2, r
+  integer :: arg3
+!ERROR: Actual argument #3 must be a constant expression
+  r = vec_permi(arg1, arg2, arg3)
+! ERROR: Argument #3 must be a constant expression in range 0-3
+  r = vec_permi(arg1, arg2, 11)
+end


        


More information about the flang-commits mailing list