[clang] c61eb44 - [SystemZ] Implement vector rotate in terms of funnel shift
Author: Ulrich Weigand
Date: 2023-12-04T16:52:00+01:00
New Revision: c61eb440059d6e9c18e6f8404e06bf125aa942c9
URL: https://github.com/llvm/llvm-project/commit/c61eb440059d6e9c18e6f8404e06bf125aa942c9
DIFF: https://github.com/llvm/llvm-project/commit/c61eb440059d6e9c18e6f8404e06bf125aa942c9.diff
LOG: [SystemZ] Implement vector rotate in terms of funnel shift
Clang currently implements a set of vector rotate builtins
(__builtin_s390_verll*) in terms of platform-specific LLVM
intrinsics. To simplify the IR (and allow common code
optimizations where applicable), this patch removes those LLVM
intrinsics and implements the builtins in terms of the
platform-independent funnel shift intrinsics instead.
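The identity being exploited: a rotate left is a funnel shift of a value
with itself, rotl(x, n) == fshl(x, x, n), with the amount taken modulo the
element width. A minimal scalar sketch of the equivalence (illustrative
only, not the lowering code itself; the vector forms behave the same way
element-wise):

/* Scalar model of the rewrite: rotl(x, n) == fshl(x, x, n).
   LLVM defines funnel-shift amounts modulo the bit width. */
#include <stdint.h>
#include <assert.h>

static uint32_t fshl32(uint32_t hi, uint32_t lo, uint32_t amt) {
  amt %= 32;                 /* amount wraps modulo the width */
  if (amt == 0)
    return hi;
  return (hi << amt) | (lo >> (32 - amt));
}

static uint32_t rotl32(uint32_t x, uint32_t amt) {
  return fshl32(x, x, amt);  /* rotate == funnel shift of x with itself */
}

int main(void) {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 40) == rotl32(0x12345678u, 8)); /* mod 32 */
  return 0;
}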
Also, fix the prototypes of the __builtin_s390_verll*
builtins (the rotate amount is now unsigned char rather than
int) for full compatibility with GCC.
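A hedged usage sketch (assumes clang targeting s390x-linux-gnu with
-march=z13 -mzvector so that <vecintrin.h> is available; the function
name is illustrative and not part of the patch):

#include <vecintrin.h>

/* vec_rli still takes an unsigned long amount; per the header change
   in this patch it is truncated to unsigned char before invoking the
   builtin, which now lowers to @llvm.fshl.v4i32 with the scalar amount
   splatted to a vector. */
__vector unsigned int rotate_elems(__vector unsigned int v, unsigned long n) {
  return vec_rli(v, n);
}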
Added:
llvm/test/CodeGen/SystemZ/vec-rot-01.ll
llvm/test/CodeGen/SystemZ/vec-rot-02.ll
Modified:
clang/include/clang/Basic/BuiltinsSystemZ.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/vecintrin.h
clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
llvm/include/llvm/IR/IntrinsicsSystemZ.td
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrVector.td
llvm/lib/Target/SystemZ/SystemZOperators.td
llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsSystemZ.def b/clang/include/clang/Basic/BuiltinsSystemZ.def
index 079e411364885..b84cf5b9cec9f 100644
--- a/clang/include/clang/Basic/BuiltinsSystemZ.def
+++ b/clang/include/clang/Basic/BuiltinsSystemZ.def
@@ -105,10 +105,10 @@ TARGET_BUILTIN(__builtin_s390_verimb, "V16UcV16UcV16UcV16UcIi", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verimh, "V8UsV8UsV8UsV8UsIi", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verimf, "V4UiV4UiV4UiV4UiIi", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verimg, "V2ULLiV2ULLiV2ULLiV2ULLiIi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_verllb, "V16UcV16UcUi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_verllh, "V8UsV8UsUi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_verllf, "V4UiV4UiUi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllb, "V16UcV16UcUc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllh, "V8UsV8UsUc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllf, "V4UiV4UiUc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUc", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verllvb, "V16UcV16UcV16Uc", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verllvh, "V8UsV8UsV8Us", "nc", "vector")
TARGET_BUILTIN(__builtin_s390_verllvf, "V4UiV4UiV4Ui", "nc", "vector")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 65d9862621061..a0f4172002613 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18337,6 +18337,32 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, {X, Undef});
}
+ case SystemZ::BI__builtin_s390_verllb:
+ case SystemZ::BI__builtin_s390_verllh:
+ case SystemZ::BI__builtin_s390_verllf:
+ case SystemZ::BI__builtin_s390_verllg: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
+ // Splat scalar rotate amount to vector type.
+ unsigned NumElts = cast<llvm::FixedVectorType>(ResultType)->getNumElements();
+ Amt = Builder.CreateIntCast(Amt, ResultType->getScalarType(), false);
+ Amt = Builder.CreateVectorSplat(NumElts, Amt);
+ Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
+ return Builder.CreateCall(F, { Src, Src, Amt });
+ }
+
+ case SystemZ::BI__builtin_s390_verllvb:
+ case SystemZ::BI__builtin_s390_verllvh:
+ case SystemZ::BI__builtin_s390_verllvf:
+ case SystemZ::BI__builtin_s390_verllvg: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
+ Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
+ return Builder.CreateCall(F, { Src, Src, Amt });
+ }
+
case SystemZ::BI__builtin_s390_vfsqsb:
case SystemZ::BI__builtin_s390_vfsqdb: {
llvm::Type *ResultType = ConvertType(E->getType());
diff --git a/clang/lib/Headers/vecintrin.h b/clang/lib/Headers/vecintrin.h
index ec1dbfd015f6e..0c535225c78e5 100644
--- a/clang/lib/Headers/vecintrin.h
+++ b/clang/lib/Headers/vecintrin.h
@@ -6565,45 +6565,45 @@ vec_rl(__vector unsigned long long __a, __vector unsigned long long __b) {
static inline __ATTRS_o_ai __vector signed char
vec_rli(__vector signed char __a, unsigned long __b) {
return (__vector signed char)__builtin_s390_verllb(
- (__vector unsigned char)__a, (int)__b);
+ (__vector unsigned char)__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector unsigned char
vec_rli(__vector unsigned char __a, unsigned long __b) {
- return __builtin_s390_verllb(__a, (int)__b);
+ return __builtin_s390_verllb(__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector signed short
vec_rli(__vector signed short __a, unsigned long __b) {
return (__vector signed short)__builtin_s390_verllh(
- (__vector unsigned short)__a, (int)__b);
+ (__vector unsigned short)__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector unsigned short
vec_rli(__vector unsigned short __a, unsigned long __b) {
- return __builtin_s390_verllh(__a, (int)__b);
+ return __builtin_s390_verllh(__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector signed int
vec_rli(__vector signed int __a, unsigned long __b) {
return (__vector signed int)__builtin_s390_verllf(
- (__vector unsigned int)__a, (int)__b);
+ (__vector unsigned int)__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector unsigned int
vec_rli(__vector unsigned int __a, unsigned long __b) {
- return __builtin_s390_verllf(__a, (int)__b);
+ return __builtin_s390_verllf(__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector signed long long
vec_rli(__vector signed long long __a, unsigned long __b) {
return (__vector signed long long)__builtin_s390_verllg(
- (__vector unsigned long long)__a, (int)__b);
+ (__vector unsigned long long)__a, (unsigned char)__b);
}
static inline __ATTRS_o_ai __vector unsigned long long
vec_rli(__vector unsigned long long __a, unsigned long __b) {
- return __builtin_s390_verllg(__a, (int)__b);
+ return __builtin_s390_verllg(__a, (unsigned char)__b);
}
/*-- vec_rl_mask ------------------------------------------------------------*/
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
index f01813ee76034..d17daaf35ca4b 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
@@ -23,6 +23,7 @@ volatile vec_ulong vul;
volatile vec_double vd;
volatile unsigned int len;
+volatile unsigned char amt;
const void * volatile cptr;
void * volatile ptr;
int cc;
@@ -184,23 +185,23 @@ void test_integer(void) {
vul = __builtin_s390_verimg(vul, vul, vul, 255);
// CHECK: call <2 x i64> @llvm.s390.verimg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i32 255)
- vuc = __builtin_s390_verllb(vuc, len);
- // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}})
- vus = __builtin_s390_verllh(vus, len);
- // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}})
- vui = __builtin_s390_verllf(vui, len);
- // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}})
- vul = __builtin_s390_verllg(vul, len);
- // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}})
+ vuc = __builtin_s390_verllb(vuc, amt);
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+ vus = __builtin_s390_verllh(vus, amt);
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ vui = __builtin_s390_verllf(vui, amt);
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ vul = __builtin_s390_verllg(vul, amt);
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
vuc = __builtin_s390_verllvb(vuc, vuc);
- // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
vus = __builtin_s390_verllvh(vus, vus);
- // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
vui = __builtin_s390_verllvf(vui, vui);
- // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
vul = __builtin_s390_verllvg(vul, vul);
- // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
vus = __builtin_s390_vgfmb(vuc, vuc);
// CHECK: call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index 44f8cbe2cc017..0dc2fa7c66dd2 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -2564,53 +2564,53 @@ void test_integer(void) {
// (emulated)
vsc = vec_rl(vsc, vuc);
- // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK-ASM: verllvb
vuc = vec_rl(vuc, vuc);
- // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK-ASM: verllvb
vss = vec_rl(vss, vus);
- // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK-ASM: verllvh
vus = vec_rl(vus, vus);
- // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK-ASM: verllvh
vsi = vec_rl(vsi, vui);
- // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
// CHECK-ASM: verllvf
vui = vec_rl(vui, vui);
- // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
// CHECK-ASM: verllvf
vsl = vec_rl(vsl, vul);
- // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK-ASM: verllvg
vul = vec_rl(vul, vul);
- // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK-ASM: verllvg
vsc = vec_rli(vsc, ul);
- // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK-ASM: verllb
vuc = vec_rli(vuc, ul);
- // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
// CHECK-ASM: verllb
vss = vec_rli(vss, ul);
- // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK-ASM: verllh
vus = vec_rli(vus, ul);
- // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
// CHECK-ASM: verllh
vsi = vec_rli(vsi, ul);
- // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
// CHECK-ASM: verllf
vui = vec_rli(vui, ul);
- // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
// CHECK-ASM: verllf
vsl = vec_rli(vsl, ul);
- // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK-ASM: verllg
vul = vec_rli(vul, ul);
- // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}})
+ // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK-ASM: verllg
vsc = vec_rl_mask(vsc, vuc, 0);
diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
index 9d21f3eb5352e..9f79bdfa9d2d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td
+++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
@@ -30,10 +30,6 @@ class SystemZBinaryConv<string name, LLVMType result, LLVMType arg>
class SystemZBinary<string name, LLVMType type>
: SystemZBinaryConv<name, type, type>;
-class SystemZBinaryInt<string name, LLVMType type>
- : ClangBuiltin<"__builtin_s390_" # name>,
- Intrinsic<[type], [type, llvm_i32_ty], [IntrNoMem]>;
-
class SystemZBinaryConvCC<LLVMType result, LLVMType arg>
: Intrinsic<[result, llvm_i32_ty], [arg, arg], [IntrNoMem]>;
@@ -131,13 +127,6 @@ multiclass SystemZBinaryBHFG<string name> : SystemZBinaryBHF<name> {
def g : SystemZBinary<name#"g", llvm_v2i64_ty>;
}
-multiclass SystemZBinaryIntBHFG<string name> {
- def b : SystemZBinaryInt<name#"b", llvm_v16i8_ty>;
- def h : SystemZBinaryInt<name#"h", llvm_v8i16_ty>;
- def f : SystemZBinaryInt<name#"f", llvm_v4i32_ty>;
- def g : SystemZBinaryInt<name#"g", llvm_v2i64_ty>;
-}
-
multiclass SystemZBinaryCCBHF {
def bs : SystemZBinaryCC<llvm_v16i8_ty>;
def hs : SystemZBinaryCC<llvm_v8i16_ty>;
@@ -303,8 +292,6 @@ let TargetPrefix = "s390" in {
defm int_s390_vmo : SystemZBinaryExtBHF<"vmo">;
defm int_s390_vmlo : SystemZBinaryExtBHF<"vmlo">;
- defm int_s390_verllv : SystemZBinaryBHFG<"verllv">;
- defm int_s390_verll : SystemZBinaryIntBHFG<"verll">;
defm int_s390_verim : SystemZQuaternaryIntBHFG<"verim">;
def int_s390_vsl : SystemZBinary<"vsl", llvm_v16i8_ty>;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 4e57986206dc6..d0eb0255f7d92 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -385,16 +385,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
- // Detect shifts by a scalar amount and convert them into
+ // Detect shifts/rotates by a scalar amount and convert them into
// V*_BY_SCALAR.
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
-
- // At present ROTL isn't matched by DAGCombiner. ROTR should be
- // converted into ROTL.
- setOperationAction(ISD::ROTL, VT, Expand);
- setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Custom);
// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
// and inverting the result as necessary.
@@ -5979,6 +5975,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
case ISD::SRA:
return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
+ case ISD::ROTL:
+ return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
case ISD::IS_FPCLASS:
return lowerIS_FPCLASS(Op, DAG);
case ISD::GET_ROUNDING:
@@ -6143,6 +6141,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(VSHL_BY_SCALAR);
OPCODE(VSRL_BY_SCALAR);
OPCODE(VSRA_BY_SCALAR);
+ OPCODE(VROTL_BY_SCALAR);
OPCODE(VSUM);
OPCODE(VICMPE);
OPCODE(VICMPH);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index fd951b935702a..40fe433f816fa 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -215,11 +215,12 @@ enum NodeType : unsigned {
UNPACK_LOW,
UNPACKL_LOW,
- // Shift each element of vector operand 0 by the number of bits specified
- // by scalar operand 1.
+ // Shift/rotate each element of vector operand 0 by the number of bits
+ // specified by scalar operand 1.
VSHL_BY_SCALAR,
VSRL_BY_SCALAR,
VSRA_BY_SCALAR,
+ VROTL_BY_SCALAR,
// For each element of the output type, sum across all sub-elements of
// operand 0 belonging to the corresponding element, and add in the
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 82863d7838a95..37d6945dc7a05 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -732,21 +732,17 @@ let Predicates = [FeatureVector] in {
// Element rotate left logical (with vector shift amount).
def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>;
- def VERLLVB : BinaryVRRc<"verllvb", 0xE773, int_s390_verllvb,
- v128b, v128b, 0>;
- def VERLLVH : BinaryVRRc<"verllvh", 0xE773, int_s390_verllvh,
- v128h, v128h, 1>;
- def VERLLVF : BinaryVRRc<"verllvf", 0xE773, int_s390_verllvf,
- v128f, v128f, 2>;
- def VERLLVG : BinaryVRRc<"verllvg", 0xE773, int_s390_verllvg,
- v128g, v128g, 3>;
+ def VERLLVB : BinaryVRRc<"verllvb", 0xE773, rotl, v128b, v128b, 0>;
+ def VERLLVH : BinaryVRRc<"verllvh", 0xE773, rotl, v128h, v128h, 1>;
+ def VERLLVF : BinaryVRRc<"verllvf", 0xE773, rotl, v128f, v128f, 2>;
+ def VERLLVG : BinaryVRRc<"verllvg", 0xE773, rotl, v128g, v128g, 3>;
// Element rotate left logical (with scalar shift amount).
def VERLL : BinaryVRSaGeneric<"verll", 0xE733>;
- def VERLLB : BinaryVRSa<"verllb", 0xE733, int_s390_verllb, v128b, v128b, 0>;
- def VERLLH : BinaryVRSa<"verllh", 0xE733, int_s390_verllh, v128h, v128h, 1>;
- def VERLLF : BinaryVRSa<"verllf", 0xE733, int_s390_verllf, v128f, v128f, 2>;
- def VERLLG : BinaryVRSa<"verllg", 0xE733, int_s390_verllg, v128g, v128g, 3>;
+ def VERLLB : BinaryVRSa<"verllb", 0xE733, z_vrotl_by_scalar, v128b, v128b, 0>;
+ def VERLLH : BinaryVRSa<"verllh", 0xE733, z_vrotl_by_scalar, v128h, v128h, 1>;
+ def VERLLF : BinaryVRSa<"verllf", 0xE733, z_vrotl_by_scalar, v128f, v128f, 2>;
+ def VERLLG : BinaryVRSa<"verllg", 0xE733, z_vrotl_by_scalar, v128g, v128g, 3>;
// Element rotate and insert under mask.
def VERIM : QuaternaryVRIdGeneric<"verim", 0xE772>;
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 6713cac2a7807..4f0f23fe3ef8e 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -324,6 +324,8 @@ def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
SDT_ZVecBinaryInt>;
def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR",
SDT_ZVecBinaryInt>;
+def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR",
+ SDT_ZVecBinaryInt>;
def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
index 5338ccc9b4292..e69dc9d009a54 100644
--- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
@@ -94,14 +94,6 @@ declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>)
declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>)
declare <2 x i64> @llvm.s390.vmlof(<4 x i32>, <4 x i32>)
-declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>)
-declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>)
-declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32)
-declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32)
-declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32)
-declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32)
declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32)
declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32)
@@ -1487,117 +1479,6 @@ define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) {
ret <2 x i64> %res
}
-; VERLLVB.
-define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_verllvb:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllvb %v24, %v24, %v26
-; CHECK-NEXT: br %r14
- %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %res
-}
-
-; VERLLVH.
-define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_verllvh:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllvh %v24, %v24, %v26
-; CHECK-NEXT: br %r14
- %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b)
- ret <8 x i16> %res
-}
-
-; VERLLVF.
-define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_verllvf:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllvf %v24, %v24, %v26
-; CHECK-NEXT: br %r14
- %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b)
- ret <4 x i32> %res
-}
-
-; VERLLVG.
-define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_verllvg:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllvg %v24, %v24, %v26
-; CHECK-NEXT: br %r14
- %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b)
- ret <2 x i64> %res
-}
-
-; VERLLB.
-define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) {
-; CHECK-LABEL: test_verllb:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllb %v24, %v24, 0(%r2)
-; CHECK-NEXT: br %r14
- %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b)
- ret <16 x i8> %res
-}
-
-; VERLLH.
-define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) {
-; CHECK-LABEL: test_verllh:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllh %v24, %v24, 0(%r2)
-; CHECK-NEXT: br %r14
- %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b)
- ret <8 x i16> %res
-}
-
-; VERLLF.
-define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: test_verllf:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllf %v24, %v24, 0(%r2)
-; CHECK-NEXT: br %r14
- %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b)
- ret <4 x i32> %res
-}
-
-; VERLLG.
-define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) {
-; CHECK-LABEL: test_verllg:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllg %v24, %v24, 0(%r2)
-; CHECK-NEXT: br %r14
- %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b)
- ret <2 x i64> %res
-}
-
-; VERLLB with the smallest count.
-define <16 x i8> @test_verllb_1(<16 x i8> %a) {
-; CHECK-LABEL: test_verllb_1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllb %v24, %v24, 1
-; CHECK-NEXT: br %r14
- %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1)
- ret <16 x i8> %res
-}
-
-; VERLLB with the largest count.
-define <16 x i8> @test_verllb_4095(<16 x i8> %a) {
-; CHECK-LABEL: test_verllb_4095:
-; CHECK: # %bb.0:
-; CHECK-NEXT: verllb %v24, %v24, 4095
-; CHECK-NEXT: br %r14
- %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095)
- ret <16 x i8> %res
-}
-
-; VERLLB with the largest count + 1.
-define <16 x i8> @test_verllb_4096(<16 x i8> %a) {
-; CHECK-LABEL: test_verllb_4096:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lhi %r1, 4096
-; CHECK-NEXT: verllb %v24, %v24, 0(%r1)
-; CHECK-NEXT: br %r14
- %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096)
- ret <16 x i8> %res
-}
-
; VERIMB.
define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_verimb:
@@ -1888,7 +1769,7 @@ define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vtm %v24, %v26
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB151_1: # %store
+; CHECK-NEXT: .LBB140_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b)
@@ -1937,7 +1818,7 @@ define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vceqbs %v24, %v24, %v26
; CHECK-NEXT: bor %r14
-; CHECK-NEXT: .LBB154_1: # %store
+; CHECK-NEXT: .LBB143_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b)
@@ -1988,7 +1869,7 @@ define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vceqhs %v24, %v24, %v26
; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB157_1: # %store
+; CHECK-NEXT: .LBB146_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -2040,7 +1921,7 @@ define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vceqfs %v24, %v24, %v26
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB160_1: # %store
+; CHECK-NEXT: .LBB149_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -2092,7 +1973,7 @@ define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vceqgs %v24, %v24, %v26
; CHECK-NEXT: bnher %r14
-; CHECK-NEXT: .LBB163_1: # %store
+; CHECK-NEXT: .LBB152_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b)
@@ -2143,7 +2024,7 @@ define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vchbs %v24, %v24, %v26
; CHECK-NEXT: bor %r14
-; CHECK-NEXT: .LBB166_1: # %store
+; CHECK-NEXT: .LBB155_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b)
@@ -2194,7 +2075,7 @@ define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vchhs %v24, %v24, %v26
; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB169_1: # %store
+; CHECK-NEXT: .LBB158_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -2246,7 +2127,7 @@ define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vchfs %v24, %v24, %v26
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB172_1: # %store
+; CHECK-NEXT: .LBB161_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b)
@@ -2297,7 +2178,7 @@ define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vchgs %v24, %v24, %v26
; CHECK-NEXT: bnher %r14
-; CHECK-NEXT: .LBB175_1: # %store
+; CHECK-NEXT: .LBB164_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b)
@@ -2348,7 +2229,7 @@ define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vchlbs %v24, %v24, %v26
; CHECK-NEXT: bor %r14
-; CHECK-NEXT: .LBB178_1: # %store
+; CHECK-NEXT: .LBB167_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b)
@@ -2399,7 +2280,7 @@ define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vchlhs %v24, %v24, %v26
; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB181_1: # %store
+; CHECK-NEXT: .LBB170_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -2451,7 +2332,7 @@ define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vchlfs %v24, %v24, %v26
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB184_1: # %store
+; CHECK-NEXT: .LBB173_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -2503,7 +2384,7 @@ define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) {
; CHECK: # %bb.0:
; CHECK-NEXT: vchlgs %v24, %v24, %v26
; CHECK-NEXT: bnher %r14
-; CHECK-NEXT: .LBB187_1: # %store
+; CHECK-NEXT: .LBB176_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b)
@@ -3450,7 +3331,7 @@ define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vfcedbs %v24, %v24, %v26
; CHECK-NEXT: bor %r14
-; CHECK-NEXT: .LBB260_1: # %store
+; CHECK-NEXT: .LBB249_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -3505,7 +3386,7 @@ define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vfchdbs %v24, %v24, %v26
; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB263_1: # %store
+; CHECK-NEXT: .LBB252_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
@@ -3560,7 +3441,7 @@ define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b,
; CHECK: # %bb.0:
; CHECK-NEXT: vfchedbs %v24, %v24, %v26
; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB266_1: # %store
+; CHECK-NEXT: .LBB255_1: # %store
; CHECK-NEXT: mvhi 0(%r2), 0
; CHECK-NEXT: br %r14
ptr %ptr) {
diff --git a/llvm/test/CodeGen/SystemZ/vec-rot-01.ll b/llvm/test/CodeGen/SystemZ/vec-rot-01.ll
new file mode 100644
index 0000000000000..fae20350f3caf
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-rot-01.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test vector rotate left instructions with vector rotate amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; Test a v16i8 rotate left.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val, <16 x i8> %amt) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %inv = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8,
+ i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %amt
+ %parta = shl <16 x i8> %val, %amt
+ %partb = lshr <16 x i8> %val, %inv
+
+ %rotl = or <16 x i8> %parta, %partb
+
+ ret <16 x i8> %rotl
+}
+
+; Test a v16i8 rotate left (matched from fshl).
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val, <16 x i8> %amt) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %rotl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %val, <16 x i8> %val, <16 x i8> %amt)
+
+ ret <16 x i8> %rotl
+}
+
+; Test a v8i16 rotate left.
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val, <8 x i16> %amt) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvh %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %inv = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16,
+ i16 16, i16 16, i16 16, i16 16>, %amt
+ %parta = shl <8 x i16> %val, %amt
+ %partb = lshr <8 x i16> %val, %inv
+
+ %rotl = or <8 x i16> %parta, %partb
+
+ ret <8 x i16> %rotl
+}
+
+; Test a v8i16 rotate left (matched from fshl).
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val, <8 x i16> %amt) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvh %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %rotl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %val, <8 x i16> %val, <8 x i16> %amt)
+
+ ret <8 x i16> %rotl
+}
+
+; Test a v4i32 rotate left.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val, <4 x i32> %amt) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvf %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
+ %parta = shl <4 x i32> %val, %amt
+ %partb = lshr <4 x i32> %val, %inv
+
+ %rotl = or <4 x i32> %parta, %partb
+
+ ret <4 x i32> %rotl
+}
+
+; Test a v4i32 rotate left (matched from fshl).
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val, <4 x i32> %amt) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvf %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %rotl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %val, <4 x i32> %val, <4 x i32> %amt)
+
+ ret <4 x i32> %rotl
+}
+
+; Test a v2i64 rotate left.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val, <2 x i64> %amt) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvg %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %inv = sub <2 x i64> <i64 64, i64 64>, %amt
+ %parta = shl <2 x i64> %val, %amt
+ %partb = lshr <2 x i64> %val, %inv
+
+ %rotl = or <2 x i64> %parta, %partb
+
+ ret <2 x i64> %rotl
+}
+
+; Test a v2i64 rotate left (matched from fshl).
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val, <2 x i64> %amt) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvg %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+ %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt)
+
+ ret <2 x i64> %rotl
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-rot-02.ll b/llvm/test/CodeGen/SystemZ/vec-rot-02.ll
new file mode 100644
index 0000000000000..1331c6290af17
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-rot-02.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test vector rotate left instructions with scalar rotate amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; Test a v16i8 rotate left.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val, i32 %scalar) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllb %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = trunc i32 %scalar to i8
+ %tmp = insertelement <16 x i8> undef, i8 %scalar_tmp, i32 0
+ %amt = shufflevector <16 x i8> %tmp, <16 x i8> undef,
+ <16 x i32> zeroinitializer
+
+ %inv = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8,
+ i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %amt
+ %parta = shl <16 x i8> %val, %amt
+ %partb = lshr <16 x i8> %val, %inv
+
+ %rotl = or <16 x i8> %parta, %partb
+
+ ret <16 x i8> %rotl
+}
+
+; Test a v16i8 rotate left (matched from fshl).
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val, i32 %scalar) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllb %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = trunc i32 %scalar to i8
+ %tmp = insertelement <16 x i8> undef, i8 %scalar_tmp, i32 0
+ %amt = shufflevector <16 x i8> %tmp, <16 x i8> undef,
+ <16 x i32> zeroinitializer
+
+ %rotl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %val, <16 x i8> %val, <16 x i8> %amt)
+
+ ret <16 x i8> %rotl
+}
+
+; Test a v8i16 rotate left.
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val, i32 %scalar) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllh %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = trunc i32 %scalar to i16
+ %tmp = insertelement <8 x i16> undef, i16 %scalar_tmp, i32 0
+ %amt = shufflevector <8 x i16> %tmp, <8 x i16> undef,
+ <8 x i32> zeroinitializer
+
+ %inv = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16,
+ i16 16, i16 16, i16 16, i16 16>, %amt
+ %parta = shl <8 x i16> %val, %amt
+ %partb = lshr <8 x i16> %val, %inv
+
+ %rotl = or <8 x i16> %parta, %partb
+
+ ret <8 x i16> %rotl
+}
+
+; Test a v8i16 rotate left (matched from fshl).
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val, i32 %scalar) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllh %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = trunc i32 %scalar to i16
+ %tmp = insertelement <8 x i16> undef, i16 %scalar_tmp, i32 0
+ %amt = shufflevector <8 x i16> %tmp, <8 x i16> undef,
+ <8 x i32> zeroinitializer
+
+ %rotl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %val, <8 x i16> %val, <8 x i16> %amt)
+
+ ret <8 x i16> %rotl
+}
+
+; Test a v4i32 rotate left.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val, i32 %scalar) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllf %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %tmp = insertelement <4 x i32> undef, i32 %scalar, i32 0
+ %amt = shufflevector <4 x i32> %tmp, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+
+ %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
+ %parta = shl <4 x i32> %val, %amt
+ %partb = lshr <4 x i32> %val, %inv
+
+ %rotl = or <4 x i32> %parta, %partb
+
+ ret <4 x i32> %rotl
+}
+
+; Test a v4i32 rotate left (matched from fshl).
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val, i32 %scalar) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllf %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %tmp = insertelement <4 x i32> undef, i32 %scalar, i32 0
+ %amt = shufflevector <4 x i32> %tmp, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+
+ %rotl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %val, <4 x i32> %val, <4 x i32> %amt)
+
+ ret <4 x i32> %rotl
+}
+
+; Test a v2i64 rotate left.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val, i32 %scalar) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllg %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = zext i32 %scalar to i64
+ %tmp = insertelement <2 x i64> undef, i64 %scalar_tmp, i32 0
+ %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef,
+ <2 x i32> zeroinitializer
+
+ %inv = sub <2 x i64> <i64 64, i64 64>, %amt
+ %parta = shl <2 x i64> %val, %amt
+ %partb = lshr <2 x i64> %val, %inv
+
+ %rotl = or <2 x i64> %parta, %partb
+
+ ret <2 x i64> %rotl
+}
+
+; Test a v2i64 rotate left (matched from fshl).
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val, i32 %scalar) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllg %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %scalar_tmp = zext i32 %scalar to i64
+ %tmp = insertelement <2 x i64> undef, i64 %scalar_tmp, i32 0
+ %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef,
+ <2 x i32> zeroinitializer
+
+ %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt)
+
+ ret <2 x i64> %rotl
+}
+
+; Test a v2i64 rotate left (matched from fshl).
+define <2 x i64> @f9(<2 x i64> %dummy, <2 x i64> %val, i64 %scalar) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllg %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+ %tmp = insertelement <2 x i64> undef, i64 %scalar, i32 0
+ %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef,
+ <2 x i32> zeroinitializer
+
+ %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt)
+
+ ret <2 x i64> %rotl
+}