[llvm] f8c5a4c - [WebAssembly] Optimize out shift masks

Thomas Lively via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 7 23:14:43 PDT 2021


Author: Thomas Lively
Date: 2021-07-07T23:14:31-07:00
New Revision: f8c5a4c67075877e1b6976bb7372aa96f02c11bc

URL: https://github.com/llvm/llvm-project/commit/f8c5a4c67075877e1b6976bb7372aa96f02c11bc
DIFF: https://github.com/llvm/llvm-project/commit/f8c5a4c67075877e1b6976bb7372aa96f02c11bc.diff

LOG: [WebAssembly] Optimize out shift masks

WebAssembly's shift instructions implicitly mask the shift count, so optimize
out redundant explicit masks of the shift count. For vector shifts, this
currently only works if the mask is applied before splatting the shift count;
that limitation should be addressed in a future commit. Resolves PR49655.

Differential Revision: https://reviews.llvm.org/D105600
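
As an illustration of the effect (mirroring the first test case in
masked-shifts.ll below), IR that masks a 32-bit shift count such as

    %m = and i32 %x, 31
    %a = shl i32 %v, %m

now lowers to a bare i32.shl with no preceding i32.and, since the wasm
instruction already takes the count modulo 32.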

Added: 
    llvm/test/CodeGen/WebAssembly/masked-shifts.ll

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
    llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 18250cf8ef850..7a0c524d63b0d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -93,6 +93,14 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                  [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
                  "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
 
+// Optimize away an explicit mask on a shift count.
+def : Pat<(shl I32:$lhs, (and I32:$rhs, 31)), (SHL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(sra I32:$lhs, (and I32:$rhs, 31)), (SHR_S_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(srl I32:$lhs, (and I32:$rhs, 31)), (SHR_U_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(shl I64:$lhs, (and I64:$rhs, 63)), (SHL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(sra I64:$lhs, (and I64:$rhs, 63)), (SHR_S_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(srl I64:$lhs, (and I64:$rhs, 63)), (SHR_U_I64 I64:$lhs, I64:$rhs)>;
+
 // Optimize away an explicit mask on a rotate count.
 def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 2c35b4944fc47..d7058ff049362 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -898,6 +898,35 @@ defm SHL : SIMDShiftInt<wasm_shl, "shl", 107>;
 defm SHR_S : SIMDShiftInt<wasm_shr_s, "shr_s", 108>;
 defm SHR_U : SIMDShiftInt<wasm_shr_u, "shr_u", 109>;
 
+// Optimize away an explicit mask on a shift count.
+def : Pat<(wasm_shl (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+          (SHL_I8x16 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+          (SHR_S_I8x16 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v16i8 V128:$lhs), (and I32:$rhs, 7)),
+          (SHR_U_I8x16 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+          (SHL_I16x8 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+          (SHR_S_I16x8 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v8i16 V128:$lhs), (and I32:$rhs, 15)),
+          (SHR_U_I16x8 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+          (SHL_I32x4 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_s (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+          (SHR_S_I32x4 V128:$lhs, I32:$rhs)>;
+def : Pat<(wasm_shr_u (v4i32 V128:$lhs), (and I32:$rhs, 31)),
+          (SHR_U_I32x4 V128:$lhs, I32:$rhs)>;
+
+def : Pat<(wasm_shl (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+          (SHL_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+def : Pat<(wasm_shr_s (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+          (SHR_S_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+def : Pat<(wasm_shr_u (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))),
+          (SHR_U_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>;
+
 //===----------------------------------------------------------------------===//
 // Integer binary arithmetic
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll
new file mode 100644
index 0000000000000..75db5e190bd22
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll
@@ -0,0 +1,531 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+;; Check that masked shift counts are optimized out.
+
+;; TODO: optimize the *_late functions.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+define i32 @shl_i32(i32 %v, i32 %x) {
+; CHECK-LABEL: shl_i32:
+; CHECK:         .functype shl_i32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %a = shl i32 %v, %m
+  ret i32 %a
+}
+
+define i32 @sra_i32(i32 %v, i32 %x) {
+; CHECK-LABEL: sra_i32:
+; CHECK:         .functype sra_i32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %a = ashr i32 %v, %m
+  ret i32 %a
+}
+
+define i32 @srl_i32(i32 %v, i32 %x) {
+; CHECK-LABEL: srl_i32:
+; CHECK:         .functype srl_i32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %a = lshr i32 %v, %m
+  ret i32 %a
+}
+
+define i64 @shl_i64(i64 %v, i64 %x) {
+; CHECK-LABEL: shl_i64:
+; CHECK:         .functype shl_i64 (i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %a = shl i64 %v, %m
+  ret i64 %a
+}
+
+define i64 @sra_i64(i64 %v, i64 %x) {
+; CHECK-LABEL: sra_i64:
+; CHECK:         .functype sra_i64 (i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %a = ashr i64 %v, %m
+  ret i64 %a
+}
+
+define i64 @srl_i64(i64 %v, i64 %x) {
+; CHECK-LABEL: srl_i64:
+; CHECK:         .functype srl_i64 (i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %a = lshr i64 %v, %m
+  ret i64 %a
+}
+
+define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: shl_v16i8:
+; CHECK:         .functype shl_v16i8 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i8 %x, 7
+  %t = insertelement <16 x i8> undef, i8 %m, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = shl <16 x i8> %v, %s
+  ret <16 x i8> %a
+}
+
+define <16 x i8> @shl_v16i8_late(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: shl_v16i8_late:
+; CHECK:         .functype shl_v16i8_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.splat
+; CHECK-NEXT:    v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i8x16.extract_lane_u 0
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <16 x i8> undef, i8 %x, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <16 x i8> %s, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
+                          i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %a = shl <16 x i8> %v, %m
+  ret <16 x i8> %a
+}
+
+define <16 x i8> @ashr_v16i8(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: ashr_v16i8:
+; CHECK:         .functype ashr_v16i8 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i8 %x, 7
+  %t = insertelement <16 x i8> undef, i8 %m, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = ashr <16 x i8> %v, %s
+  ret <16 x i8> %a
+}
+
+define <16 x i8> @ashr_v16i8_late(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: ashr_v16i8_late:
+; CHECK:         .functype ashr_v16i8_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.splat
+; CHECK-NEXT:    v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i8x16.extract_lane_u 0
+; CHECK-NEXT:    i8x16.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <16 x i8> undef, i8 %x, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <16 x i8> %s, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
+                          i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %a = ashr <16 x i8> %v, %m
+  ret <16 x i8> %a
+}
+
+define <16 x i8> @lshr_v16i8(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: lshr_v16i8:
+; CHECK:         .functype lshr_v16i8 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i8 %x, 7
+  %t = insertelement <16 x i8> undef, i8 %m, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = lshr <16 x i8> %v, %s
+  ret <16 x i8> %a
+}
+
+define <16 x i8> @lshr_v16i8_late(<16 x i8> %v, i8 %x) {
+; CHECK-LABEL: lshr_v16i8_late:
+; CHECK:         .functype lshr_v16i8_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.splat
+; CHECK-NEXT:    v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i8x16.extract_lane_u 0
+; CHECK-NEXT:    i8x16.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <16 x i8> undef, i8 %x, i32 0
+  %s = shufflevector <16 x i8> %t, <16 x i8> undef,
+    <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+                i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <16 x i8> %s, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
+                          i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %a = lshr <16 x i8> %v, %m
+  ret <16 x i8> %a
+}
+
+define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: shl_v8i16:
+; CHECK:         .functype shl_v8i16 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i16 %x, 15
+  %t = insertelement <8 x i16> undef, i16 %m, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = shl <8 x i16> %v, %s
+  ret <8 x i16> %a
+}
+
+define <8 x i16> @shl_v8i16_late(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: shl_v8i16_late:
+; CHECK:         .functype shl_v8i16_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.splat
+; CHECK-NEXT:    v128.const 15, 15, 15, 15, 15, 15, 15, 15
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.extract_lane_u 0
+; CHECK-NEXT:    i16x8.shl
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <8 x i16> undef, i16 %x, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <8 x i16> %s,
+    <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %a = shl <8 x i16> %v, %m
+  ret <8 x i16> %a
+}
+
+define <8 x i16> @ashr_v8i16(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: ashr_v8i16:
+; CHECK:         .functype ashr_v8i16 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i16 %x, 15
+  %t = insertelement <8 x i16> undef, i16 %m, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = ashr <8 x i16> %v, %s
+  ret <8 x i16> %a
+}
+
+define <8 x i16> @ashr_v8i16_late(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: ashr_v8i16_late:
+; CHECK:         .functype ashr_v8i16_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.splat
+; CHECK-NEXT:    v128.const 15, 15, 15, 15, 15, 15, 15, 15
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.extract_lane_u 0
+; CHECK-NEXT:    i16x8.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <8 x i16> undef, i16 %x, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <8 x i16> %s,
+    <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %a = ashr <8 x i16> %v, %m
+  ret <8 x i16> %a
+}
+
+define <8 x i16> @lshr_v8i16(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: lshr_v8i16:
+; CHECK:         .functype lshr_v8i16 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i16 %x, 15
+  %t = insertelement <8 x i16> undef, i16 %m, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %a = lshr <8 x i16> %v, %s
+  ret <8 x i16> %a
+}
+
+define <8 x i16> @lshr_v8i16_late(<8 x i16> %v, i16 %x) {
+; CHECK-LABEL: lshr_v8i16_late:
+; CHECK:         .functype lshr_v8i16_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.splat
+; CHECK-NEXT:    v128.const 15, 15, 15, 15, 15, 15, 15, 15
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.extract_lane_u 0
+; CHECK-NEXT:    i16x8.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <8 x i16> undef, i16 %x, i32 0
+  %s = shufflevector <8 x i16> %t, <8 x i16> undef,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %m = and <8 x i16> %s,
+    <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %a = lshr <8 x i16> %v, %m
+  ret <8 x i16> %a
+}
+
+define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: shl_v4i32:
+; CHECK:         .functype shl_v4i32 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %t = insertelement <4 x i32> undef, i32 %m, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %a = shl <4 x i32> %v, %s
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @shl_v4i32_late(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: shl_v4i32_late:
+; CHECK:         .functype shl_v4i32_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    v128.const 31, 31, 31, 31
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i32x4.extract_lane 0
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <4 x i32> undef, i32 %x, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %m = and <4 x i32> %s, <i32 31, i32 31, i32 31, i32 31>
+  %a = shl <4 x i32> %v, %m
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @ashr_v4i32(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: ashr_v4i32:
+; CHECK:         .functype ashr_v4i32 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %t = insertelement <4 x i32> undef, i32 %m, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %a = ashr <4 x i32> %v, %s
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @ashr_v4i32_late(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: ashr_v4i32_late:
+; CHECK:         .functype ashr_v4i32_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    v128.const 31, 31, 31, 31
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i32x4.extract_lane 0
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <4 x i32> undef, i32 %x, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %m = and <4 x i32> %s, <i32 31, i32 31, i32 31, i32 31>
+  %a = ashr <4 x i32> %v, %m
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @lshr_v4i32(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: lshr_v4i32:
+; CHECK:         .functype lshr_v4i32 (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i32 %x, 31
+  %t = insertelement <4 x i32> undef, i32 %m, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %a = lshr <4 x i32> %v, %s
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @lshr_v4i32_late(<4 x i32> %v, i32 %x) {
+; CHECK-LABEL: lshr_v4i32_late:
+; CHECK:         .functype lshr_v4i32_late (v128, i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    v128.const 31, 31, 31, 31
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i32x4.extract_lane 0
+; CHECK-NEXT:    i32x4.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <4 x i32> undef, i32 %x, i32 0
+  %s = shufflevector <4 x i32> %t, <4 x i32> undef,
+    <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %m = and <4 x i32> %s, <i32 31, i32 31, i32 31, i32 31>
+  %a = lshr <4 x i32> %v, %m
+  ret <4 x i32> %a
+}
+
+define <2 x i64> @shl_v2i64(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: shl_v2i64:
+; CHECK:         .functype shl_v2i64 (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shl
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %t = insertelement <2 x i64> undef, i64 %m, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %a = shl <2 x i64> %v, %s
+  ret <2 x i64> %a
+}
+
+define <2 x i64> @shl_v2i64_late(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: shl_v2i64_late:
+; CHECK:         .functype shl_v2i64_late (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64x2.splat
+; CHECK-NEXT:    v128.const 63, 63
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shl
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <2 x i64> undef, i64 %x, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %m = and <2 x i64> %s, <i64 63, i64 63>
+  %a = shl <2 x i64> %v, %m
+  ret <2 x i64> %a
+}
+
+define <2 x i64> @ashr_v2i64(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: ashr_v2i64:
+; CHECK:         .functype ashr_v2i64 (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %t = insertelement <2 x i64> undef, i64 %m, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %a = ashr <2 x i64> %v, %s
+  ret <2 x i64> %a
+}
+
+define <2 x i64> @ashr_v2i64_late(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: ashr_v2i64_late:
+; CHECK:         .functype ashr_v2i64_late (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64x2.splat
+; CHECK-NEXT:    v128.const 63, 63
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <2 x i64> undef, i64 %x, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %m = and <2 x i64> %s, <i64 63, i64 63>
+  %a = ashr <2 x i64> %v, %m
+  ret <2 x i64> %a
+}
+
+define <2 x i64> @lshr_v2i64(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: lshr_v2i64:
+; CHECK:         .functype lshr_v2i64 (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %m = and i64 %x, 63
+  %t = insertelement <2 x i64> undef, i64 %m, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %a = lshr <2 x i64> %v, %s
+  ret <2 x i64> %a
+}
+
+define <2 x i64> @lshr_v2i64_late(<2 x i64> %v, i64 %x) {
+; CHECK-LABEL: lshr_v2i64_late:
+; CHECK:         .functype lshr_v2i64_late (v128, i64) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64x2.splat
+; CHECK-NEXT:    v128.const 63, 63
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i64x2.shr_u
+; CHECK-NEXT:    # fallthrough-return
+  %t = insertelement <2 x i64> undef, i64 %x, i32 0
+  %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %m = and <2 x i64> %s, <i64 63, i64 63>
+  %a = lshr <2 x i64> %v, %m
+  ret <2 x i64> %a
+}
