r181865 - ARM: Improve codegen for vget_low_* and vget_high_ intrinsics.

Tue May 14 19:40:04 PDT 2013

Author: grosbach
Date: Tue May 14 21:40:04 2013
New Revision: 181865

URL: http://llvm.org/viewvc/llvm-project?rev=181865&view=rev
Log:
ARM: Improve codegen for vget_low_* and vget_high_ intrinsics.

These intrinsics use the __builtin_shuffle() function to extract the
low and high half, respectively, of a 128-bit NEON vector. Currently,
they're defined to use bitcasts to simplify the emitter, so we get code
like:
uint16x4_t vget_low_u32(uint16x8_t __a) {
  return (uint32x2_t) __builtin_shufflevector((int64x2_t) __a,
                                              (int64x2_t) __a,
                                              0);
}

While this works, it results in those bitcasts going all the way through
to the IR, resulting in code like:
  %1 = bitcast <8 x i16> %in to <2 x i64>
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <1 x i32>
  %zeroinitializer
  %3 = bitcast <1 x i64> %2 to <4 x i16>

We can instead easily perform the operation directly on the input vector
like:

uint16x4_t vget_low_u16(uint16x8_t __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

Not only is that much easier to read on its own, it also results in
cleaner IR like:

  %1 = shufflevector <8 x i16> %in, <8 x i16> undef,
                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>

This is both easier to read and easier for the back end to reason
about effectively since the operation is obfuscating the source with
bitcasts.

rdar://13894163

Added:
    cfe/trunk/test/CodeGen/arm-neon-vget.c
Modified:
    cfe/trunk/utils/TableGen/NeonEmitter.cpp

Added: cfe/trunk/test/CodeGen/arm-neon-vget.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/arm-neon-vget.c?rev=181865&view=auto
==============================================================================

--- cfe/trunk/test/CodeGen/arm-neon-vget.c (added)
+++ cfe/trunk/test/CodeGen/arm-neon-vget.c Tue May 14 21:40:04 2013
@@ -0,0 +1,124 @@
+// REQUIRES: arm-registered-target
+// RUN: %clang_cc1 -triple thumbv7-apple-darwin \
+// RUN:   -target-abi apcs-gnu \
+// RUN:   -target-cpu cortex-a8 \
+// RUN:   -mfloat-abi soft \
+// RUN:   -target-feature +soft-float-abi \
+// RUN:   -ffreestanding \
+// RUN:   -emit-llvm -w -O1 -o - %s | FileCheck %s
+
+#include <arm_neon.h>
+
+// Check that the vget_low/vget_high intrinsics generate a single shuffle
+// without any bitcasting.
+int8x8_t low_s8(int8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return vget_low_s8(a);
+}
+
+uint8x8_t low_u8 (uint8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return vget_low_u8(a);
+}
+
+int16x4_t low_s16( int16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return vget_low_s16(a);
+}
+
+uint16x4_t low_u16(uint16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return vget_low_u16(a);
+}
+
+int32x2_t low_s32( int32x4_t a) {
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  return vget_low_s32(a);
+}
+
+uint32x2_t low_u32(uint32x4_t a) {
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  return vget_low_u32(a);
+}
+
+int64x1_t low_s64( int64x2_t a) {
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+  return vget_low_s64(a);
+}
+
+uint64x1_t low_u64(uint64x2_t a) {
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+  return vget_low_u64(a);
+}
+
+poly8x8_t low_p8 (poly8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return vget_low_p8(a);
+}
+
+poly16x4_t low_p16(poly16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return vget_low_p16(a);
+}
+
+float32x2_t low_f32(float32x4_t a) {
+// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  return vget_low_f32(a);
+}
+
+
+int8x8_t high_s8(int8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return vget_high_s8(a);
+}
+
+uint8x8_t high_u8 (uint8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return vget_high_u8(a);
+}
+
+int16x4_t high_s16( int16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  return vget_high_s16(a);
+}
+
+uint16x4_t high_u16(uint16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  return vget_high_u16(a);
+}
+
+int32x2_t high_s32( int32x4_t a) {
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  return vget_high_s32(a);
+}
+
+uint32x2_t high_u32(uint32x4_t a) {
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  return vget_high_u32(a);
+}
+
+int64x1_t high_s64( int64x2_t a) {
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+  return vget_high_s64(a);
+}
+
+uint64x1_t high_u64(uint64x2_t a) {
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+  return vget_high_u64(a);
+}
+
+poly8x8_t high_p8 (poly8x16_t a) {
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return vget_high_p8(a);
+}
+
+poly16x4_t high_p16(poly16x8_t a) {
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  return vget_high_p16(a);
+}
+
+float32x2_t high_f32(float32x4_t a) {
+// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  return vget_high_f32(a);
+}
+

Modified: cfe/trunk/utils/TableGen/NeonEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/utils/TableGen/NeonEmitter.cpp?rev=181865&r1=181864&r2=181865&view=diff
==============================================================================
--- cfe/trunk/utils/TableGen/NeonEmitter.cpp (original)
+++ cfe/trunk/utils/TableGen/NeonEmitter.cpp Tue May 14 21:40:04 2013
@@ -1410,12 +1410,17 @@ static std::string GenOpString(OpKind op
     s += ", (int64x1_t)__b, 0, 1);";
     break;
   case OpHi:
-    s += "(" + ts +
-      ")__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1);";
+    // nElts is for the result vector, so the source is twice that number.
+    s += "__builtin_shufflevector(__a, __a";
+    for (unsigned i = nElts; i < nElts * 2; ++i)
+      s += ", " + utostr(i);
+    s+= ");";
     break;
   case OpLo:
-    s += "(" + ts +
-      ")__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0);";
+    s += "__builtin_shufflevector(__a, __a";
+    for (unsigned i = 0; i < nElts; ++i)
+      s += ", " + utostr(i);
+    s+= ");";
     break;
   case OpDup:
     s += Duplicate(nElts, typestr, "__a") + ";";