[llvm] 410626c - [VE] Support vld intrinsics

Kazushi Marukawa via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 12 14:34:55 PST 2020


Author: Kazushi (Jam) Marukawa
Date: 2020-11-13T07:34:42+09:00
New Revision: 410626c9b56a2844652dcff9ca23765a6918186f

URL: https://github.com/llvm/llvm-project/commit/410626c9b56a2844652dcff9ca23765a6918186f
DIFF: https://github.com/llvm/llvm-project/commit/410626c9b56a2844652dcff9ca23765a6918186f.diff

LOG: [VE] Support vld intrinsics

Add intrinsics for vector load instructions. Also add a regression test.

Reviewed By: simoll

Differential Revision: https://reviews.llvm.org/D91332
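
For reference, each of these intrinsics maps one-to-one onto a VE vector load
instruction; going by the patterns and the regression test below, the
i64/i8*/i32 operands are the stride in bytes, the base address, and the active
vector length. A minimal usage sketch in LLVM IR (the function name is
illustrative and not part of this commit):

    declare <256 x double> @llvm.ve.vl.vld.vssl(i64, i8*, i32)

    ; Strided load of 256 elements starting at %addr, with a stride of
    ; %stride bytes and a vector length of 256.
    define <256 x double> @load_example(i8* %addr, i64 %stride) {
      %v = call <256 x double> @llvm.ve.vl.vld.vssl(i64 %stride, i8* %addr, i32 256)
      ret <256 x double> %v
    }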

Added: 
    llvm/include/llvm/IR/IntrinsicsVE.td
    llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
    llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
    llvm/lib/Target/VE/VEInstrIntrinsicVL.td
    llvm/test/CodeGen/VE/VELIntrinsics/lit.local.cfg
    llvm/test/CodeGen/VE/VELIntrinsics/vld.ll

Modified: 
    llvm/include/llvm/IR/CMakeLists.txt
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/IR/Function.cpp
    llvm/lib/Target/VE/VEInstrInfo.td

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/CMakeLists.txt b/llvm/include/llvm/IR/CMakeLists.txt
index c8edc29bd887..0498fc269b63 100644
--- a/llvm/include/llvm/IR/CMakeLists.txt
+++ b/llvm/include/llvm/IR/CMakeLists.txt
@@ -18,4 +18,5 @@ tablegen(LLVM IntrinsicsS390.h -gen-intrinsic-enums -intrinsic-prefix=s390)
 tablegen(LLVM IntrinsicsWebAssembly.h -gen-intrinsic-enums -intrinsic-prefix=wasm)
 tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86)
 tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore)
+tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve)
 add_public_tablegen_target(intrinsics_gen)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index e0f3d67a62dd..81e0340b0429 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1649,3 +1649,4 @@ include "llvm/IR/IntrinsicsBPF.td"
 include "llvm/IR/IntrinsicsSystemZ.td"
 include "llvm/IR/IntrinsicsWebAssembly.td"
 include "llvm/IR/IntrinsicsRISCV.td"
+include "llvm/IR/IntrinsicsVE.td"

diff --git a/llvm/include/llvm/IR/IntrinsicsVE.td b/llvm/include/llvm/IR/IntrinsicsVE.td
new file mode 100644
index 000000000000..1cb7a2e1eaf4
--- /dev/null
+++ b/llvm/include/llvm/IR/IntrinsicsVE.td
@@ -0,0 +1,4 @@
+// Define intrinsics written by hand
+
+// Define intrinsics automatically generated
+include "llvm/IR/IntrinsicsVEVL.gen.td"

diff --git a/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
new file mode 100644
index 000000000000..c4002a2a3b62
--- /dev/null
+++ b/llvm/include/llvm/IR/IntrinsicsVEVL.gen.td
@@ -0,0 +1,32 @@
+let TargetPrefix = "ve" in def int_ve_vl_vld_vssl : GCCBuiltin<"__builtin_ve_vl_vld_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vld_vssvl : GCCBuiltin<"__builtin_ve_vl_vld_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu_vssl : GCCBuiltin<"__builtin_ve_vl_vldu_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssl : GCCBuiltin<"__builtin_ve_vl_vldunc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldunc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldunc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssl : GCCBuiltin<"__builtin_ve_vl_vldlsx_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlsx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlsx_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldlsxnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlsxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlsxnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssl : GCCBuiltin<"__builtin_ve_vl_vldlzx_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlzx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlzx_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldlzxnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldlzxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldlzxnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssl : GCCBuiltin<"__builtin_ve_vl_vld2d_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vld2d_vssvl : GCCBuiltin<"__builtin_ve_vl_vld2d_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vld2dnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vld2dnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vld2dnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssl : GCCBuiltin<"__builtin_ve_vl_vldu2d_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu2d_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu2d_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldu2dnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldu2dnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldu2dnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dsx_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dsx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dsx_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dsxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dsxnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dzx_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dzx_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dzx_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssl : GCCBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<i32>], [IntrReadMem]>;
+let TargetPrefix = "ve" in def int_ve_vl_vldl2dzxnc_vssvl : GCCBuiltin<"__builtin_ve_vl_vldl2dzxnc_vssvl">, Intrinsic<[LLVMType<v256f64>], [LLVMType<i64>, llvm_ptr_ty, LLVMType<v256f64>, LLVMType<i32>], [IntrReadMem]>;

diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 8c360dd1841e..a316c7c9c068 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/IntrinsicsS390.h"
+#include "llvm/IR/IntrinsicsVE.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/IntrinsicsXCore.h"

diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index acfa680223c2..1a15058cf6c4 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -2221,3 +2221,6 @@ def : Pat<(i64 (and i64:$val, 0xffffffff)),
 
 // Vector instructions.
 include "VEInstrVec.td"
+
+// The vevlintrin
+include "VEInstrIntrinsicVL.td"

diff --git a/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
new file mode 100644
index 000000000000..fbb350271561
--- /dev/null
+++ b/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
@@ -0,0 +1,64 @@
+def : Pat<(int_ve_vl_vld_vssl i64:$sy, i64:$sz, i32:$vl), (VLDrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld_vssl simm7:$I, i64:$sz, i32:$vl), (VLDirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu_vssl i64:$sy, i64:$sz, i32:$vl), (VLDUrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDUrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu_vssl simm7:$I, i64:$sz, i32:$vl), (VLDUirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDUirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldunc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDUNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldunc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDUNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldunc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDUNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldunc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDUNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLSXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLSXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLSXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLSXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLZXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLZXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLZXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLZXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2d_vssl i64:$sy, i64:$sz, i32:$vl), (VLD2Drrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2d_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLD2Drrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2d_vssl simm7:$I, i64:$sz, i32:$vl), (VLD2Dirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2d_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLD2Dirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2dnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLD2DNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2dnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLD2DNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2dnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLD2DNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2dnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLD2DNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2d_vssl i64:$sy, i64:$sz, i32:$vl), (VLDU2Drrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2d_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2Drrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2d_vssl simm7:$I, i64:$sz, i32:$vl), (VLDU2Dirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2d_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2Dirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDU2DNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2DNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDU2DNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2DNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DSXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DSXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DSXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DSXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DZXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DZXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DZXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DZXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;

diff --git a/llvm/lib/Target/VE/VEInstrIntrinsicVL.td b/llvm/lib/Target/VE/VEInstrIntrinsicVL.td
new file mode 100644
index 000000000000..c8d253ef65ff
--- /dev/null
+++ b/llvm/lib/Target/VE/VEInstrIntrinsicVL.td
@@ -0,0 +1,6 @@
+// Pattern Matchings for VEL Intrinsics
+
+// Define intrinsics written by hand
+
+// Define intrinsics automatically generated
+include "VEInstrIntrinsicVL.gen.td"

diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/lit.local.cfg b/llvm/test/CodeGen/VE/VELIntrinsics/lit.local.cfg
new file mode 100644
index 000000000000..b6366779272d
--- /dev/null
+++ b/llvm/test/CodeGen/VE/VELIntrinsics/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'VE' in config.root.targets:
+    config.unsupported = True

diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/vld.ll b/llvm/test/CodeGen/VE/VELIntrinsics/vld.ll
new file mode 100644
index 000000000000..c57ec4338038
--- /dev/null
+++ b/llvm/test/CodeGen/VE/VELIntrinsics/vld.ll
@@ -0,0 +1,1126 @@
+; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s
+
+;;; Test vector load intrinsic instructions
+;;;
+;;; Note:
+;;;   We test VLD*rrl, VLD*irl, VLD*rrl_v, VLD*irl_v
+
+; Function Attrs: nounwind
+define void @vld_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vld_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vld_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vld_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vld %v0, %s1, %s2
+; CHECK-NEXT:    vld %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vld.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vld_vssl_imm(i8* %0) {
+; CHECK-LABEL: vld_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vld %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vld_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vld_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld %v0, 8, %s1
+; CHECK-NEXT:    vld %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vld.nc %v0, %s1, %s2
+; CHECK-NEXT:    vld.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vld.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld.nc %v0, 8, %s1
+; CHECK-NEXT:    vld.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldu_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldu_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldu_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldu %v0, %s1, %s2
+; CHECK-NEXT:    vldu %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldu_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldu_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldu %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldu_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu %v0, 8, %s1
+; CHECK-NEXT:    vldu %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldunc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldunc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldunc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldunc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldunc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldu.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldu.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldunc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldunc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldunc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldu.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldunc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldunc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu.nc %v0, 8, %s1
+; CHECK-NEXT:    vldu.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldunc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlsx_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldlsx_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.sx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlsx.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldlsx_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldlsx_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl.sx %v0, %s1, %s2
+; CHECK-NEXT:    vldl.sx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlsx.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldlsx_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldlsx_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl.sx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlsx_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldlsx_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.sx %v0, 8, %s1
+; CHECK-NEXT:    vldl.sx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlsx.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlsxnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldlsxnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.sx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlsxnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldlsxnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldlsxnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl.sx.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldl.sx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlsxnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldlsxnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldlsxnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl.sx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlsxnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldlsxnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.sx.nc %v0, 8, %s1
+; CHECK-NEXT:    vldl.sx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlsxnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlzx_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldlzx_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.zx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlzx.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldlzx_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldlzx_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl.zx %v0, %s1, %s2
+; CHECK-NEXT:    vldl.zx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlzx.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldlzx_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldlzx_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl.zx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlzx_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldlzx_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.zx %v0, 8, %s1
+; CHECK-NEXT:    vldl.zx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlzx.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlzxnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldlzxnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.zx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlzxnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldlzxnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldlzxnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl.zx.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldl.zx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldlzxnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldlzxnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldlzxnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl.zx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldlzxnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldlzxnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl.zx.nc %v0, 8, %s1
+; CHECK-NEXT:    vldl.zx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldlzxnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vld2d_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vld2d_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld2d %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld2d.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vld2d_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vld2d_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vld2d %v0, %s1, %s2
+; CHECK-NEXT:    vld2d %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld2d.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vld2d_vssl_imm(i8* %0) {
+; CHECK-LABEL: vld2d_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vld2d %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vld2d_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vld2d_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld2d %v0, 8, %s1
+; CHECK-NEXT:    vld2d %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld2d.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vld2dnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vld2dnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld2d.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld2dnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vld2dnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vld2dnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vld2d.nc %v0, %s1, %s2
+; CHECK-NEXT:    vld2d.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vld2dnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vld2dnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vld2dnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vld2d.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vld2dnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vld2dnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vld2d.nc %v0, 8, %s1
+; CHECK-NEXT:    vld2d.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vld2dnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu2d_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldu2d_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu2d %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu2d.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldu2d_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldu2d_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldu2d %v0, %s1, %s2
+; CHECK-NEXT:    vldu2d %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu2d.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldu2d_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldu2d_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldu2d %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu2d_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldu2d_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu2d %v0, 8, %s1
+; CHECK-NEXT:    vldu2d %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu2d.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu2dnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldu2dnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu2d.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu2dnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldu2dnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldu2dnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldu2d.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldu2d.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldu2dnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldu2dnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldu2dnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldu2d.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldu2dnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldu2dnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldu2d.nc %v0, 8, %s1
+; CHECK-NEXT:    vldu2d.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldu2dnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dsx_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldl2dsx_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.sx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dsx.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dsx_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldl2dsx_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl2d.sx %v0, %s1, %s2
+; CHECK-NEXT:    vldl2d.sx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dsx.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dsx_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldl2dsx_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl2d.sx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dsx_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldl2dsx_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.sx %v0, 8, %s1
+; CHECK-NEXT:    vldl2d.sx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsx.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dsxnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldl2dsxnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.sx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dsxnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dsxnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldl2dsxnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl2d.sx.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldl2d.sx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dsxnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dsxnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldl2dsxnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl2d.sx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dsxnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldl2dsxnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.sx.nc %v0, 8, %s1
+; CHECK-NEXT:    vldl2d.sx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dsxnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dzx_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldl2dzx_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.zx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dzx.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dzx_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldl2dzx_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl2d.zx %v0, %s1, %s2
+; CHECK-NEXT:    vldl2d.zx %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dzx.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dzx_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldl2dzx_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl2d.zx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dzx_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldl2dzx_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.zx %v0, 8, %s1
+; CHECK-NEXT:    vldl2d.zx %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzx.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dzxnc_vssl(i8* %0, i64 %1) {
+; CHECK-LABEL: vldl2dzxnc_vssl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.zx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssl(i64 %1, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %3, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dzxnc.vssl(i64, i8*, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dzxnc_vssvl(i8* %0, i64 %1, i8* %2) {
+; CHECK-LABEL: vldl2dzxnc_vssvl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s3, 256
+; CHECK-NEXT:    vldl2d.zx.nc %v0, %s1, %s2
+; CHECK-NEXT:    vldl2d.zx.nc %v0, %s1, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, %s1, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssl(i64 %1, i8* %2, i32 256)
+  %5 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssvl(i64 %1, i8* %0, <256 x double> %4, i32 256)
+  tail call void asm sideeffect "vst $0, $1, $2", "v,r,r"(<256 x double> %5, i64 %1, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <256 x double> @llvm.ve.vl.vldl2dzxnc.vssvl(i64, i8*, <256 x double>, i32)
+
+; Function Attrs: nounwind
+define void @vldl2dzxnc_vssl_imm(i8* %0) {
+; CHECK-LABEL: vldl2dzxnc_vssl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    vldl2d.zx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %2 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssl(i64 8, i8* %0, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %2, i8* %0)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @vldl2dzxnc_vssvl_imm(i8* %0, i8* %1) {
+; CHECK-LABEL: vldl2dzxnc_vssvl_imm:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    vldl2d.zx.nc %v0, 8, %s1
+; CHECK-NEXT:    vldl2d.zx.nc %v0, 8, %s0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or %s11, 0, %s9
+  %3 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssl(i64 8, i8* %1, i32 256)
+  %4 = tail call fast <256 x double> @llvm.ve.vl.vldl2dzxnc.vssvl(i64 8, i8* %0, <256 x double> %3, i32 256)
+  tail call void asm sideeffect "vst $0, 8, $1", "v,r"(<256 x double> %4, i8* %0)
+  ret void
+}

More information about the llvm-commits mailing list