[clang] 8e3e56f - [WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic

Mon May 11 10:02:06 PDT 2020

Author: Thomas Lively
Date: 2020-05-11T10:01:55-07:00
New Revision: 8e3e56f2a36701480eeb65e426701d5a9025cc59

URL: https://github.com/llvm/llvm-project/commit/8e3e56f2a36701480eeb65e426701d5a9025cc59
DIFF: https://github.com/llvm/llvm-project/commit/8e3e56f2a36701480eeb65e426701d5a9025cc59.diff

LOG: [WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic

Summary:

Although using `__builtin_shufflevector` and the `shufflevector`
instruction works fine, they are not opaque to the optimizer. As a
result, DAGCombine can potentially reduce the number of shuffles and
change the shuffle masks. This is unexpected behavior for users of the
WebAssembly SIMD intrinsics who have crafted their shuffles to
optimize the code generated by engines. This patch solves the problem
by adding a new shuffle intrinsic that is opaque to the optimizers in
line with the decision of the WebAssembly SIMD contributors at
https://github.com/WebAssembly/simd/issues/196#issuecomment-622494748. In
the future we may implement custom DAG combines to properly optimize
shuffles and replace this solution.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D66983

Added: 
    

Modified: 
    clang/include/clang/Basic/BuiltinsWebAssembly.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/wasm_simd128.h
    clang/test/CodeGen/builtins-wasm.c
    llvm/include/llvm/IR/IntrinsicsWebAssembly.td
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index cbdb33c0c352..3c7e27544856 100644

--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -119,6 +119,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i8x16, "V16cV16cV16c", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c64fde719445..541dac7b7580 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16044,6 +16044,20 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()});
     return Builder.CreateCall(Callee, Vec);
   }
+  case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
+    Value *Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
+    while (OpIdx < 18) {
+      llvm::APSInt LaneConst;
+      if (!E->getArg(OpIdx)->isIntegerConstantExpr(LaneConst, getContext()))
+        llvm_unreachable("Constant arg isn't actually constant?");
+      Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
+    }
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
+    return Builder.CreateCall(Callee, Ops);
+  }
   default:
     return nullptr;
   }

diff  --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
index 612aec139723..b81594efcc82 100644
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -1020,23 +1020,31 @@ wasm_f32x4_convert_u32x4(v128_t __a) {
 #define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7, __c8, __c9, __c10, __c11, __c12, __c13,       \
                            __c14, __c15)                                       \
-  ((v128_t)(__builtin_shufflevector(                                           \
-      (__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
-      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
+      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))
 
 #define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7)                                               \
-  ((v128_t)(__builtin_shufflevector((__u16x8)(__a), (__u16x8)(__b), __c0,      \
-                                    __c1, __c2, __c3, __c4, __c5, __c6,        \
-                                    __c7)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 2, __c0 * 2 + 1, __c1 * 2,        \
+      __c1 * 2 + 1, __c2 * 2, __c2 * 2 + 1, __c3 * 2, __c3 * 2 + 1, __c4 * 2,  \
+      __c4 * 2 + 1, __c5 * 2, __c5 * 2 + 1, __c6 * 2, __c6 * 2 + 1, __c7 * 2,  \
+      __c7 * 2 + 1))
 
 #define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                   \
-  ((v128_t)(__builtin_shufflevector((__u32x4)(__a), (__u32x4)(__b), __c0,      \
-                                    __c1, __c2, __c3)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 4, __c0 * 4 + 1, __c0 * 4 + 2,    \
+      __c0 * 4 + 3, __c1 * 4, __c1 * 4 + 1, __c1 * 4 + 2, __c1 * 4 + 3,        \
+      __c2 * 4, __c2 * 4 + 1, __c2 * 4 + 2, __c2 * 4 + 3, __c3 * 4,            \
+      __c3 * 4 + 1, __c3 * 4 + 2, __c3 * 4 + 3))
 
 #define wasm_v64x2_shuffle(__a, __b, __c0, __c1)                               \
-  ((v128_t)(                                                                   \
-      __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), __c0, __c1)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 8, __c0 * 8 + 1, __c0 * 8 + 2,    \
+      __c0 * 8 + 3, __c0 * 8 + 4, __c0 * 8 + 5, __c0 * 8 + 6, __c0 * 8 + 7,    \
+      __c1 * 8, __c1 * 8 + 1, __c1 * 8 + 2, __c1 * 8 + 3, __c1 * 8 + 4,        \
+      __c1 * 8 + 5, __c1 * 8 + 6, __c1 * 8 + 7))
 
 #ifdef __wasm_unimplemented_simd128__
 

diff  --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 729a677c2485..5f50861cfa82 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -724,5 +724,14 @@ i32x4 widen_high_u_i32x4_i16x8(i16x8 v) {
 i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
   return __builtin_wasm_swizzle_v8x16(x, y);
   // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
+}
+
+i8x16 shuffle(i8x16 x, i8x16 y) {
+  return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
+                                      8, 9, 10, 11, 12, 13, 14, 15);
+  // WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+  // WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+  // WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
+  // WEBASSEMBLY-SAME: i32 15
   // WEBASSEMBLY-NEXT: ret
 }

diff  --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 90b5a25d16c0..a95740cc0dcd 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -104,6 +104,13 @@ def int_wasm_swizzle :
   Intrinsic<[llvm_v16i8_ty],
             [llvm_v16i8_ty, llvm_v16i8_ty],
             [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_shuffle :
+  Intrinsic<[llvm_v16i8_ty],
+            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_saturate_signed :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
@@ -116,7 +123,6 @@ def int_wasm_avgr_unsigned :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
             [IntrNoMem, IntrSpeculatable]>;
-
 def int_wasm_bitselect :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -170,7 +176,6 @@ def int_wasm_widen_high_unsigned :
             [llvm_anyvector_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
-
 //===----------------------------------------------------------------------===//
 // Bulk memory intrinsics
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 303b33e128d5..4a4585814381 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1353,6 +1353,24 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
                            Op.getOperand(3)  // thrown value
                        });
   }
+
+  case Intrinsic::wasm_shuffle: {
+    // Drop in-chain and replace undefs, but otherwise pass through unchanged
+    SDValue Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = Op.getOperand(1);
+    Ops[OpIdx++] = Op.getOperand(2);
+    while (OpIdx < 18) {
+      const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
+      if (MaskIdx.isUndef() ||
+          cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
+        Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
+      } else {
+        Ops[OpIdx++] = MaskIdx;
+      }
+    }
+    return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
+  }
   }
 }
 

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index 6e0fadc9d708..c827fb7e5242 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -141,6 +141,36 @@ define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shuffle_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <16 x i8> @llvm.wasm.shuffle(
+  <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+  i32, i32, i32, i32, i32)
+define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 2)
+  ret <16 x i8> %res
+}
+
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================