[llvm] d30155f - [PowerPC] Implementation of 128-bit Binary Vector Rotate builtins

Albion Fung via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 16 15:04:06 PDT 2020


Author: Albion Fung
Date: 2020-10-16T18:03:22-04:00
New Revision: d30155feaa9c4ddd09cb115fb30ea5810f63af9c

URL: https://github.com/llvm/llvm-project/commit/d30155feaa9c4ddd09cb115fb30ea5810f63af9c
DIFF: https://github.com/llvm/llvm-project/commit/d30155feaa9c4ddd09cb115fb30ea5810f63af9c.diff

LOG: [PowerPC] Implementation of 128-bit Binary Vector Rotate builtins

This patch implements 128-bit Binary Vector Rotate builtins for PowerPC10.

Differential Revision: https://reviews.llvm.org/D86819

Added: 
    llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll

Modified: 
    clang/include/clang/Basic/BuiltinsPPC.def
    clang/lib/Headers/altivec.h
    clang/test/CodeGen/builtins-ppc-p10vector.c
    llvm/include/llvm/IR/IntrinsicsPowerPC.td
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCInstrPrefix.td

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index 015411abc508..fe20c9418e8d 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -422,6 +422,10 @@ BUILTIN(__builtin_altivec_vextduwvrx, "V2ULLiV4UiV4UiUi", "")
 BUILTIN(__builtin_altivec_vextddvlx, "V2ULLiV2ULLiV2ULLiUi", "")
 BUILTIN(__builtin_altivec_vextddvrx, "V2ULLiV2ULLiV2ULLiUi", "")
 
+// P10 Vector rotate built-ins.
+BUILTIN(__builtin_altivec_vrlqmi, "V1ULLLiV1ULLLiV1ULLLiV1ULLLi", "")
+BUILTIN(__builtin_altivec_vrlqnm, "V1ULLLiV1ULLLiV1ULLLi", "")
+
 // VSX built-ins.
 
 BUILTIN(__builtin_vsx_lxvd2x, "V2divC*", "")

diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
index 1d7bc201d330..2df420d640f1 100644
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -7927,6 +7927,18 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) {
+  return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector signed __int128)) - __a));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __a));
+}
+#endif
+
 /* vec_rlmi */
 #ifdef __POWER9_VECTOR__
 static __inline__ vector unsigned int __ATTRS_o_ai
@@ -7940,8 +7952,24 @@ vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
          vector unsigned long long __c) {
   return __builtin_altivec_vrldmi(__a, __c, __b);
 }
+#endif
+
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlmi(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vrlqmi(__a, __c, __b);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlmi(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vrlqmi(__a, __c, __b);
+}
+#endif
 
 /* vec_rlnm */
+#ifdef __POWER9_VECTOR__
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_rlnm(vector unsigned int __a, vector unsigned int __b,
          vector unsigned int __c) {
@@ -7957,6 +7985,42 @@ vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  // Merge __b and __c using an appropriate shuffle.
+  vector unsigned char TmpB = (vector unsigned char)__b;
+  vector unsigned char TmpC = (vector unsigned char)__c;
+  vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0,
+                              1, -1, -1, -1, -1, -1);
+#else
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
+                              -1, -1, -1, -1, -1, -1, -1);
+#endif
+   return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlnm(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  // Merge __b and __c using an appropriate shuffle.
+  vector unsigned char TmpB = (vector unsigned char)__b;
+  vector unsigned char TmpC = (vector unsigned char)__c;
+  vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0,
+                              1, -1, -1, -1, -1, -1);
+#else
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
+                              -1, -1, -1, -1, -1, -1, -1);
+#endif
+  return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift);
+}
+#endif
+
 /* vec_vrlb */
 
 static __inline__ vector signed char __ATTRS_o_ai

diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
index 21ddf06b61f2..427ac47d17e4 100644
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -17,7 +17,7 @@ vector signed int vsia, vsib;
 vector unsigned int vuia, vuib, vuic;
 vector signed long long vslla, vsllb;
 vector unsigned long long vulla, vullb, vullc;
-vector signed __int128 vsi128a, vsi128b;
+vector signed __int128 vsi128a, vsi128b, vsi128c;
 vector unsigned __int128 vui128a, vui128b, vui128c;
 vector float vfa, vfb;
 vector double vda, vdb;
@@ -1880,3 +1880,53 @@ int test_vec_all_ge_u128(void) {
   // CHECK-NEXT: ret i32
   return vec_all_ge(vui128a, vui128b);
 }
+
+vector signed __int128 test_vec_rl_s128(void) {
+  // CHECK-LABEL: @test_vec_rl_s128(
+  // CHECK: sub <1 x i128>
+  // CHECK-NEXT: lshr <1 x i128>
+  // CHECK-NEXT: or <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rl(vsi128a, vsi128b);
+}
+
+vector unsigned __int128 test_vec_rl_u128(void) {
+  // CHECK-LABEL: @test_vec_rl_u128(
+  // CHECK: sub <1 x i128>
+  // CHECK: lshr <1 x i128>
+  // CHECK: or <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rl(vui128a, vui128b);
+}
+
+vector signed __int128 test_vec_rlnm_s128(void) {
+  // CHECK-LABEL: @test_vec_rlnm_s128(
+  // CHECK-LE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK-BE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlnm(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlnm_u128(void) {
+  // CHECK-LABEL: @test_vec_rlnm_u128(
+  // CHECK-LE:  %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK-BE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlnm(vui128a, vui128b, vui128c);
+}
+
+vector signed __int128 test_vec_rlmi_s128(void) {
+  // CHECK-LABEL: @test_vec_rlmi_s128(
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlmi(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlmi_u128(void) {
+  // CHECK-LABEL: @test_vec_rlmi_u128(
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlmi(vui128a, vui128b, vui128c);
+}

diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 9f3dd637a9f8..28eda03d661f 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1057,6 +1057,15 @@ def int_ppc_altivec_vrldmi :
                             [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
                             [IntrNoMem]>;
 
+def int_ppc_altivec_vrlqnm :
+      PowerPC_Vec_Intrinsic<"vrlqnm", [llvm_v1i128_ty],
+                           [llvm_v1i128_ty, llvm_v1i128_ty],
+                            [IntrNoMem]>;
+def int_ppc_altivec_vrlqmi :
+      PowerPC_Vec_Intrinsic<"vrlqmi", [llvm_v1i128_ty],
+                            [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
+                            [IntrNoMem]>;
+
 // Vector Divide Extended Intrinsics.
 def int_ppc_altivec_vdivesw : PowerPC_Vec_WWW_Intrinsic<"vdivesw">;
 def int_ppc_altivec_vdiveuw : PowerPC_Vec_WWW_Intrinsic<"vdiveuw">;

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index e93d5d04b3ec..d9b9689a8a0e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -892,6 +892,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::SREM, MVT::v1i128, Legal);
       setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
       setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
+      setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
     }
 
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);

diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 384cf1c08fbd..2c118f7d293f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2111,10 +2111,16 @@ let Predicates = [IsISA3_1] in {
                                "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>;
   def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB),
                                "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>;
-  def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", []>;
+  def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm",
+                               [(set v1i128:$vD,
+                                   (int_ppc_altivec_vrlqnm v1i128:$vA,
+                                                           v1i128:$vB))]>;
   def VRLQMI : VXForm_1<69, (outs vrrc:$vD),
                         (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
-                        "vrlqmi $vD, $vA, $vB", IIC_VecFP, []>,
+                        "vrlqmi $vD, $vA, $vB", IIC_VecFP,
+                        [(set v1i128:$vD,
+                          (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB,
+                                                  v1i128:$vDi))]>,
                         RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
   def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
   def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
@@ -2175,6 +2181,9 @@ let Predicates = [IsISA3_1] in {
              (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
   def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
              (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
+
+  def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
+            (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
 }
 
 let Predicates = [IsISA3_1, HasVSX] in {

diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll b/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
new file mode 100644
index 000000000000..477f723f5c9d
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK-LE,CHECK
+
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK-BE,CHECK
+
+; This test case aims to test the builtins for vector rotate instructions
+; on Power10.
+
+
+define <1 x i128> @test_vrlq(<1 x i128> %x, <1 x i128> %y) {
+; CHECK-LABEL: test_vrlq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrlq v2, v3, v2
+; CHECK-NEXT:    blr
+  %shl.i = shl <1 x i128> %y, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> %y, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vrlq_cost_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_mult8:
+; CHECK: # %bb.0:
+; CHECK: vrlq v2, v3, v2
+; CHECK-NEXT: blr
+  %shl.i = shl <1 x i128> <i128 16>, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> <i128 16>, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vrlq_cost_non_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_non_mult8:
+; CHECK: # %bb.0:
+; CHECK: vrlq v2, v3, v2
+; CHECK-NEXT: blr
+  %shl.i = shl <1 x i128> <i128 4>, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> <i128 4>, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqmi(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqmi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vrlqmi v3, v2, v4
+; CHECK-NEXT:    vmr v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128> %a, <1 x i128> %c, <1 x i128> %b)
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqnm(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqnm:
+; CHECK:            # %bb.0: # %entry
+; CHECK-BE:         lxvx v5
+; CHECK-BE-NEXT:    vperm v3, v3, v4, v5
+; CHECK-LE-NEXT:    plxv v5
+; CHECK-LE-NEXT:    vperm v3, v4, v3, v5
+; CHECK-NEXT:       vrlqnm v2, v2, v3
+; CHECK-NEXT:       blr
+entry:
+  %0 = bitcast <1 x i128> %b to <16 x i8>
+  %1 = bitcast <1 x i128> %c to <16 x i8>
+  %shuffle.i = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %d = bitcast <16 x i8> %shuffle.i to <1 x i128>
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128> %a, <1 x i128> %d)
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>, <1 x i128>, <1 x i128>)
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>, <1 x i128>)


        


More information about the llvm-commits mailing list