[clang] 0395f9e - [ARM] Neon Polynomial vadd Intrinsic fix

Wed Apr 28 12:00:57 PDT 2021

Author: Ryan Santhirarajan
Date: 2021-04-28T11:59:40-07:00
New Revision: 0395f9e70b8f2dc55d3ef0bb7195c3542deffc5a

URL: https://github.com/llvm/llvm-project/commit/0395f9e70b8f2dc55d3ef0bb7195c3542deffc5a
DIFF: https://github.com/llvm/llvm-project/commit/0395f9e70b8f2dc55d3ef0bb7195c3542deffc5a.diff

LOG: [ARM] Neon Polynomial vadd Intrinsic fix

The Neon vadd intrinsics were added to the ARMSIMD intrinsic map,
however due to being defined under an AArch64 guard in arm_neon.td,
were not previously useable on ARM. This change rectifies that.

It is important to note that poly128 is not valid on ARM, thus it was
extracted out of the original arm_neon.td definition and separated
for the sake of AArch64.

Reviewed By: DavidSpickett

Differential Revision: https://reviews.llvm.org/D100772

Added: 
    clang/test/CodeGen/arm-poly-add.c

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/lib/CodeGen/CGBuiltin.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 0d97f0a1c2dc..fdd82692223c 100644

--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -708,6 +708,11 @@ def SCALAR_HALF_SET_LANE : IOpInst<"vset_lane", ".1.I", "h", OP_SCALAR_HALF_SET_
 def SCALAR_HALF_GET_LANEQ : IOpInst<"vget_lane", "1.I", "Qh", OP_SCALAR_HALF_GET_LNQ>;
 def SCALAR_HALF_SET_LANEQ : IOpInst<"vset_lane", ".1.I", "Qh", OP_SCALAR_HALF_SET_LNQ>;
 
+////////////////////////////////////////////////////////////////////////////////
+// Non poly128_t vaddp for Arm and AArch64
+// TODO: poly128_t not implemented on arm32
+def VADDP   : WInst<"vadd", "...", "PcPsPlQPcQPsQPl">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // AArch64 Intrinsics
 
@@ -1171,7 +1176,9 @@ def SM4E : SInst<"vsm4e", "...", "QUi">;
 def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">;
 }
 
-def VADDP   : WInst<"vadd", "...", "PcPsPlQPcQPsQPlQPk">;
+////////////////////////////////////////////////////////////////////////////////
+// poly128_t vadd for AArch64 only see VADDP for the rest
+def VADDP_Q   : WInst<"vadd", "...", "QPk">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Float -> Int conversions with explicit rounding mode

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9681a8ad5cd3..8990cd825af3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5460,7 +5460,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
   NEONMAP1(vabsq_v, arm_neon_vabs, 0),
   NEONMAP0(vadd_v),
   NEONMAP0(vaddhn_v),
-  NEONMAP0(vaddq_p128),
   NEONMAP0(vaddq_v),
   NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
   NEONMAP1(vaeseq_v, arm_neon_aese, 0),

diff  --git a/clang/test/CodeGen/arm-poly-add.c b/clang/test/CodeGen/arm-poly-add.c
new file mode 100644
index 000000000000..fe2afca00d47
--- /dev/null
+++ b/clang/test/CodeGen/arm-poly-add.c
@@ -0,0 +1,86 @@
+// REQUIRES: arm-registered-target
+// RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi \
+// RUN:   -target-feature +neon \
+// RUN:   -mfloat-abi hard \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vadd_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+//
+poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
+  return vadd_p8 (a, b);
+}
+
+// CHECK-LABEL: @test_vadd_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP3]]
+//
+poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
+  return vadd_p16 (a, b);
+}
+
+// CHECK-LABEL: @test_vadd_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT:    ret <1 x i64> [[TMP3]]
+//
+poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
+  return vadd_p64(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
+  return vaddq_p8(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
+  return vaddq_p16(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+//
+poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
+  return vaddq_p64(a, b);
+}
+
+// TODO: poly128_t not implemented on aarch32
+// CHCK-LABEL: @test_vaddq_p128(
+// CHCK-NEXT:  entry:
+// CHCK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
+// CHCK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[B:%.*]] to <16 x i8>
+// CHCK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHCK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHCK-NEXT:    ret i128 [[TMP3]]
+//
+//poly128_t test_vaddq_p128 (poly128_t a, poly128_t b){
+//  return vaddq_p128(a, b);