[llvm] 254b4f2 - [ARM,MVE] Add intrinsics for scalar shifts.
Simon Tatham via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 19 06:48:14 PST 2019
Author: Simon Tatham
Date: 2019-11-19T14:47:29Z
New Revision: 254b4f250007ef9f2d2377eb912963beafa39754
URL: https://github.com/llvm/llvm-project/commit/254b4f250007ef9f2d2377eb912963beafa39754
DIFF: https://github.com/llvm/llvm-project/commit/254b4f250007ef9f2d2377eb912963beafa39754.diff
LOG: [ARM,MVE] Add intrinsics for scalar shifts.
This fills in the small family of MVE intrinsics that have nothing to
do with vectors: they implement bit-shift operations on 32- or 64-bit
values held in one or two general-purpose registers. Most of these
shift operations saturate if shifting left, and round to nearest if
shifting right, although LSLL and ASRL behave like ordinary shifts.
When these instructions take a variable shift count in a register,
they pay attention to its sign, so that (for example) LSLL or UQRSHLL
will shift left if given a positive number but right if given a
negative one. That makes even LSLL and ASRL different enough from
standard LLVM IR shift semantics that I couldn't see any better
alternative than to simply model the whole family as a set of
MVE-specific IR intrinsics.
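As a purely illustrative sketch of that sign-sensitive behaviour (not part of the patch; the helper names are invented, and it assumes a target with the MVE extension so that <arm_mve.h> provides these intrinsics), the ACLE intrinsics added here would be used from C like this:

  #include <arm_mve.h>

  /* Saturating, rounding shift of a 64-bit value by a run-time count:
   * shifts left for positive 'count', right for negative. */
  uint64_t scale_u64(uint64_t value, int32_t count)
  {
      return uqrshll(value, count);
  }

  /* Plain (non-saturating, non-rounding) 64-bit shift with the same
   * sign convention on the count. */
  uint64_t shift_u64(uint64_t value, int32_t count)
  {
      return lsll(value, count);
  }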
(The //immediate// forms of LSLL and ASRL, on the other hand, do
behave exactly like a standard IR shift of a 64-bit value. In fact,
those forms don't have ACLE intrinsics defined at all, because you can
just write an ordinary C shift operation if you want one of those.)
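To make that point concrete (again an invented example, not from the patch): a constant-count shift of a 64-bit value needs no intrinsic at all, because the ordinary C shift already has the semantics of the immediate forms, and the compiler is free to select those instructions for it.

  #include <stdint.h>

  /* An ordinary C shift by a constant; no ACLE intrinsic is required
   * for the immediate forms of LSLL/ASRL, since this is just a
   * standard 64-bit shift. */
  uint64_t shift_left_by_3(uint64_t value)
  {
      return value << 3;
  }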
The 64-bit shifts have to be instruction-selected in C++, because they
deliver two output values. But the 32-bit ones are simple enough that
I could write a DAG isel pattern directly into each Instruction
record.
Reviewers: ostannard, MarkMurrayARM, dmgreen
Reviewed By: dmgreen
Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D70319
Added:
Modified:
clang/include/clang/Basic/arm_mve.td
clang/include/clang/Basic/arm_mve_defs.td
clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
llvm/include/llvm/IR/IntrinsicsARM.td
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMInstrMVE.td
llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index e227d95f9735..d8d199f464d9 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -388,13 +388,56 @@ defm vstrhq: scatter_offset_both<!listconcat(T.All16, T.Int32), u16, 1>;
defm vstrwq: scatter_offset_both<T.All32, u32, 2>;
defm vstrdq: scatter_offset_both<T.Int64, u64, 3>;
-let params = [Void], pnt = PNT_None in
-def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift),
- (seq (u32 (lshr $value, (u64 32))):$hi,
- (u32 $value):$lo,
- (IRInt<"urshrl"> $lo, $hi, $shift):$pair,
- (or (shl (u64 (xval $pair, 1)), (u64 32)),
- (u64 (xval $pair, 0))))>;
+// Base class for the scalar shift intrinsics.
+class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+ Intrinsic<argtype, !con((args argtype:$value), shiftCountArg), shiftCodeGen> {
+ let params = [Void];
+ let pnt = PNT_None;
+}
+
+// Subclass that includes the machinery to take a 64-bit input apart
+// into halves, retrieve the two halves of a shifted output as a pair,
+// and glue the pieces of the pair back into an i64 for output.
+class LongScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+ ScalarShift<argtype, shiftCountArg,
+ (seq (u32 (lshr $value, (argtype 32))):$hi,
+ (u32 $value):$lo,
+ shiftCodeGen:$pair,
+ (or (shl (u64 (xval $pair, 1)), (u64 32)),
+ (u64 (xval $pair, 0))))>;
+
+// The family of saturating/rounding scalar shifts that take an
+// immediate shift count. They come in matched 32- and 64-bit pairs.
+multiclass ScalarSaturatingShiftImm<Type arg32, Type arg64> {
+ def "": ScalarShift<arg32, (args imm_1to32:$sh),
+ (IRInt<NAME> $value, $sh)>;
+ def l: LongScalarShift<arg64, (args imm_1to32:$sh),
+ (IRInt<NAME # "l"> $lo, $hi, $sh)>;
+}
+defm uqshl: ScalarSaturatingShiftImm<u32, u64>;
+defm urshr: ScalarSaturatingShiftImm<u32, u64>;
+defm sqshl: ScalarSaturatingShiftImm<s32, s64>;
+defm srshr: ScalarSaturatingShiftImm<s32, s64>;
+
+// The family of saturating/rounding scalar shifts that take a
+// register shift count. They also have 32- and 64-bit forms, but the
+// 64-bit form also has a version that saturates to 48 bits, so the IR
+// intrinsic takes an extra saturation-type operand.
+multiclass ScalarSaturatingShiftReg<Type arg32, Type arg64> {
+ def "": ScalarShift<arg32, (args s32:$sh),
+ (IRInt<NAME> $value, $sh)>;
+ def l: LongScalarShift<arg64, (args s32:$sh),
+ (IRInt<NAME # "l"> $lo, $hi, $sh, 64)>;
+ def l_sat48: LongScalarShift<arg64, (args s32:$sh),
+ (IRInt<NAME # "l"> $lo, $hi, $sh, 48)>;
+}
+defm uqrshl: ScalarSaturatingShiftReg<u32, u64>;
+defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
+
+// The intrinsics for LSLL and ASRL come in 64-bit versions only, with
+// no saturation count.
+def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
+def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
let params = T.Int32 in {
def vadcq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index a4ba4ed87de3..27cdada02ec4 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -312,7 +312,7 @@ def imm_lane : Immediate<sint, IB_LaneIndex>;
// imm_1to32 can be in the range 1 to 32, unconditionally. (e.g. scalar shift
// intrinsics)
-def imm_1to32 : Immediate<u32, IB_ConstRange<1, 32>>;
+def imm_1to32 : Immediate<sint, IB_ConstRange<1, 32>>;
// imm_1248 can be 1, 2, 4 or 8. (e.g. vidupq)
def imm_1248 : Immediate<u32, IB_ConstRange<1, 8>> {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
index 12e8f1195743..a6425e7d93f3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
@@ -3,6 +3,237 @@
#include <arm_mve.h>
+// CHECK-LABEL: @test_asrl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.asrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_asrl(int64_t value, int32_t shift)
+{
+ return asrl(value, shift);
+}
+
+// CHECK-LABEL: @test_lsll(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.lsll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_lsll(uint64_t value, int32_t shift)
+{
+ return lsll(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshr(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqrshr(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_sqrshr(int32_t value, int32_t shift)
+{
+ return sqrshr(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl(int64_t value, int32_t shift)
+{
+ return sqrshrl(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl_sat48(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl_sat48(int64_t value, int32_t shift)
+{
+ return sqrshrl_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_sqshl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqshl(i32 [[VALUE:%.*]], i32 2)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_sqshl(int32_t value)
+{
+ return sqshl(value, 2);
+}
+
+// CHECK-LABEL: @test_sqshll(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqshll(i32 [[TMP2]], i32 [[TMP1]], i32 17)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_sqshll(int64_t value)
+{
+ return sqshll(value, 17);
+}
+
+// CHECK-LABEL: @test_srshr(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.srshr(i32 [[VALUE:%.*]], i32 6)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int32_t test_srshr(int32_t value)
+{
+ return srshr(value, 6);
+}
+
+// CHECK-LABEL: @test_srshrl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.srshrl(i32 [[TMP2]], i32 [[TMP1]], i32 26)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+int64_t test_srshrl(int64_t value)
+{
+ return srshrl(value, 26);
+}
+
+// CHECK-LABEL: @test_uqrshl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqrshl(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_uqrshl(uint32_t value, int32_t shift)
+{
+ return uqrshl(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll(uint64_t value, int32_t shift)
+{
+ return uqrshll(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll_sat48(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll_sat48(uint64_t value, int32_t shift)
+{
+ return uqrshll_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_uqshl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqshl(i32 [[VALUE:%.*]], i32 21)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_uqshl(uint32_t value)
+{
+ return uqshl(value, 21);
+}
+
+// CHECK-LABEL: @test_uqshll(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqshll(i32 [[TMP2]], i32 [[TMP1]], i32 16)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT: ret i64 [[TMP9]]
+//
+uint64_t test_uqshll(uint64_t value)
+{
+ return uqshll(value, 16);
+}
+
+// CHECK-LABEL: @test_urshr(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.urshr(i32 [[VALUE:%.*]], i32 22)
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+uint32_t test_urshr(uint32_t value)
+{
+ return urshr(value, 22);
+}
+
// CHECK-LABEL: @test_urshrl(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 0ee56a2b1c18..10417411edca 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -850,9 +850,25 @@ defm int_arm_mve_vstr_scatter_offset: MVEPredicated<
[], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>;
-def int_arm_mve_urshrl: Intrinsic<
- [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
+// MVE scalar shifts.
+class ARM_MVE_qrshift_single<list<LLVMType> value,
+ list<LLVMType> saturate = []> :
+ Intrinsic<value, value # [llvm_i32_ty] # saturate, [IntrNoMem]>;
+multiclass ARM_MVE_qrshift<list<LLVMType> saturate = []> {
+ // Most of these shifts come in 32- and 64-bit versions. But only
+ // the 64-bit ones have the extra saturation argument (if any).
+ def "": ARM_MVE_qrshift_single<[llvm_i32_ty]>;
+ def l: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty], saturate>;
+}
+defm int_arm_mve_urshr: ARM_MVE_qrshift;
+defm int_arm_mve_uqshl: ARM_MVE_qrshift;
+defm int_arm_mve_srshr: ARM_MVE_qrshift;
+defm int_arm_mve_sqshl: ARM_MVE_qrshift;
+defm int_arm_mve_uqrshl: ARM_MVE_qrshift<[llvm_i32_ty]>;
+defm int_arm_mve_sqrshr: ARM_MVE_qrshift<[llvm_i32_ty]>;
+// LSLL and ASRL only have 64-bit versions, not 32.
+def int_arm_mve_lsll: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
+def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
def int_arm_mve_vadc: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 59acc34906e4..46a2560e1674 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -226,7 +226,8 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
void SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated);
/// SelectMVE_LongShift - Select MVE 64-bit scalar shift intrinsics.
- void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate);
+ void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate,
+ bool HasSaturationOperand);
/// SelectMVE_VADCSBC - Select MVE vector add/sub-with-carry intrinsics.
void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
@@ -2399,7 +2400,8 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes,
}
void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
- bool Immediate) {
+ bool Immediate,
+ bool HasSaturationOperand) {
SDLoc Loc(N);
SmallVector<SDValue, 8> Ops;
@@ -2410,11 +2412,18 @@ void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
// The shift count
if (Immediate) {
int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
- Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset
+ Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
} else {
Ops.push_back(N->getOperand(3));
}
+ // The immediate saturation operand, if any
+ if (HasSaturationOperand) {
+ int32_t SatOp = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
+ int SatBit = (SatOp == 64 ? 0 : 1);
+ Ops.push_back(getI32Imm(SatBit, Loc));
+ }
+
// MVE scalar shifts are IT-predicable, so include the standard
// predicate arguments.
Ops.push_back(getAL(CurDAG, Loc));
@@ -4267,7 +4276,28 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
break;
case Intrinsic::arm_mve_urshrl:
- SelectMVE_LongShift(N, ARM::MVE_URSHRL, true);
+ SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
+ return;
+ case Intrinsic::arm_mve_uqshll:
+ SelectMVE_LongShift(N, ARM::MVE_UQSHLL, true, false);
+ return;
+ case Intrinsic::arm_mve_srshrl:
+ SelectMVE_LongShift(N, ARM::MVE_SRSHRL, true, false);
+ return;
+ case Intrinsic::arm_mve_sqshll:
+ SelectMVE_LongShift(N, ARM::MVE_SQSHLL, true, false);
+ return;
+ case Intrinsic::arm_mve_uqrshll:
+ SelectMVE_LongShift(N, ARM::MVE_UQRSHLL, false, true);
+ return;
+ case Intrinsic::arm_mve_sqrshrl:
+ SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true);
+ return;
+ case Intrinsic::arm_mve_lsll:
+ SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false);
+ return;
+ case Intrinsic::arm_mve_asrl:
+ SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false);
return;
case Intrinsic::arm_mve_vadc:
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index c14627826e48..68aa6930a13d 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -409,9 +409,12 @@ class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr,
let Inst{19-16} = RdaDest{3-0};
}
-class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm),
- "$RdaSrc, $imm", "$RdaDest = $RdaSrc", pattern> {
+ "$RdaSrc, $imm", "$RdaDest = $RdaSrc",
+ [(set rGPR:$RdaDest,
+ (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+ (i32 rGPR:$RdaSrc), (i32 imm:$imm))))]> {
bits<5> imm;
let Inst{15} = 0b0;
@@ -427,9 +430,12 @@ def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>;
def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>;
def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>;
-class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm),
- "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", pattern> {
+ "$RdaSrc, $Rm", "$RdaDest = $RdaSrc",
+ [(set rGPR:$RdaDest,
+ (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+ (i32 rGPR:$RdaSrc), (i32 rGPR:$Rm))))]> {
bits<4> Rm;
let Inst{15-12} = Rm{3-0};
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
index c451ade15741..14326a9efabe 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scalar-shifts.ll
@@ -1,7 +1,264 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
-define arm_aapcs_vfpcc i64 @test_urshrl(i64 %value) {
+define i64 @test_asrl(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_asrl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asrl r0, r1, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.asrl(i32 %2, i32 %1, i32 %shift)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.asrl(i32, i32, i32)
+
+define i64 @test_lsll(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_lsll:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsll r0, r1, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.lsll(i32 %2, i32 %1, i32 %shift)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.lsll(i32, i32, i32)
+
+define i32 @test_sqrshr(i32 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshr:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sqrshr r0, r1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.sqrshr(i32 %value, i32 %shift)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.sqrshr(i32, i32)
+
+define i64 @test_sqrshrl(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshrl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sqrshrl r0, r1, #64, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 %2, i32 %1, i32 %shift, i32 64)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
+
+define i64 @test_sqrshrl_sat48(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_sqrshrl_sat48:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sqrshrl r0, r1, #48, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 %2, i32 %1, i32 %shift, i32 48)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define i32 @test_sqshl(i32 %value) {
+; CHECK-LABEL: test_sqshl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sqshl r0, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.sqshl(i32 %value, i32 2)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.sqshl(i32, i32)
+
+define i64 @test_sqshll(i64 %value) {
+; CHECK-LABEL: test_sqshll:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sqshll r0, r1, #17
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.sqshll(i32 %2, i32 %1, i32 17)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.sqshll(i32, i32, i32)
+
+define i32 @test_srshr(i32 %value) {
+; CHECK-LABEL: test_srshr:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: srshr r0, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.srshr(i32 %value, i32 6)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.srshr(i32, i32)
+
+define i64 @test_srshrl(i64 %value) {
+; CHECK-LABEL: test_srshrl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: srshrl r0, r1, #26
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.srshrl(i32 %2, i32 %1, i32 26)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.srshrl(i32, i32, i32)
+
+define i32 @test_uqrshl(i32 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: uqrshl r0, r1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.uqrshl(i32 %value, i32 %shift)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.uqrshl(i32, i32)
+
+define i64 @test_uqrshll(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshll:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: uqrshll r0, r1, #64, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 %2, i32 %1, i32 %shift, i32 64)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.uqrshll(i32, i32, i32, i32)
+
+define i64 @test_uqrshll_sat48(i64 %value, i32 %shift) {
+; CHECK-LABEL: test_uqrshll_sat48:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: uqrshll r0, r1, #48, r2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 %2, i32 %1, i32 %shift, i32 48)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+define i32 @test_uqshl(i32 %value) {
+; CHECK-LABEL: test_uqshl:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: uqshl r0, #21
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.uqshl(i32 %value, i32 21)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.uqshl(i32, i32)
+
+define i64 @test_uqshll(i64 %value) {
+; CHECK-LABEL: test_uqshll:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: uqshll r0, r1, #16
+; CHECK-NEXT: bx lr
+entry:
+ %0 = lshr i64 %value, 32
+ %1 = trunc i64 %0 to i32
+ %2 = trunc i64 %value to i32
+ %3 = call { i32, i32 } @llvm.arm.mve.uqshll(i32 %2, i32 %1, i32 16)
+ %4 = extractvalue { i32, i32 } %3, 1
+ %5 = zext i32 %4 to i64
+ %6 = shl i64 %5, 32
+ %7 = extractvalue { i32, i32 } %3, 0
+ %8 = zext i32 %7 to i64
+ %9 = or i64 %6, %8
+ ret i64 %9
+}
+
+declare { i32, i32 } @llvm.arm.mve.uqshll(i32, i32, i32)
+
+define i32 @test_urshr(i32 %value) {
+; CHECK-LABEL: test_urshr:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: urshr r0, #22
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call i32 @llvm.arm.mve.urshr(i32 %value, i32 22)
+ ret i32 %0
+}
+
+declare i32 @llvm.arm.mve.urshr(i32, i32)
+
+define i64 @test_urshrl(i64 %value) {
; CHECK-LABEL: test_urshrl:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: urshrl r0, r1, #6
@@ -10,10 +267,10 @@ entry:
%0 = lshr i64 %value, 32
%1 = trunc i64 %0 to i32
%2 = trunc i64 %value to i32
- %3 = tail call { i32, i32 } @llvm.arm.mve.urshrl(i32 %2, i32 %1, i32 6)
+ %3 = call { i32, i32 } @llvm.arm.mve.urshrl(i32 %2, i32 %1, i32 6)
%4 = extractvalue { i32, i32 } %3, 1
%5 = zext i32 %4 to i64
- %6 = shl nuw i64 %5, 32
+ %6 = shl i64 %5, 32
%7 = extractvalue { i32, i32 } %3, 0
%8 = zext i32 %7 to i64
%9 = or i64 %6, %8