[llvm-commits] [patch] ARM NEON VREV patterns

Anton A. Korzh anton at korzh.ru
Fri Jul 24 04:29:54 PDT 2009
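This patch teaches the ARM backend to select the NEON VREV64, VREV32,
and VREV16 instructions: vector_shuffle nodes whose masks reverse the
element order within 64-, 32-, or 16-bit blocks are recognized by a new
ARM::isVREVMask predicate and matched to the corresponding vrev
instruction in both D- and Q-register forms. A CodeGen test is included.

For reference, VREV<n>.<size> reverses the <size>-bit lanes inside each
<n>-bit block of a vector. A minimal host-side sketch of the byte-lane
(.8) case, assuming a plain array representation (the helper below is
illustrative only, not part of the patch):

    #include <cstdint>

    // Reverse 8-bit lanes within each BlockBits-sized block. For example,
    // BlockBits == 32 on 8 bytes yields lane order 3 2 1 0 7 6 5 4, the
    // same permutation as the vrev32.8 shuffle masks in the test below.
    void vrev_n_8_ref(const uint8_t *In, uint8_t *Out, unsigned NumBytes,
                      unsigned BlockBits) {
      unsigned BlockBytes = BlockBits / 8;
      for (unsigned i = 0; i != NumBytes; ++i)
        Out[i] = In[(i / BlockBytes) * BlockBytes
                    + (BlockBytes - 1 - i % BlockBytes)];
    }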


Index: test/CodeGen/ARM/vrev.ll
===================================================================
--- test/CodeGen/ARM/vrev.ll    (revision 0)
+++ test/CodeGen/ARM/vrev.ll    (revision 0)
@@ -0,0 +1,175 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon >%t
+; RUN: grep {vrev64\\.8} %t | count 2
+; RUN: grep {vrev64\\.16} %t | count 2
+; RUN: grep {vrev64\\.32} %t | count 2
+; RUN: grep {vrev32\\.8} %t | count 2
+; RUN: grep {vrev32\\.16} %t | count 2
+; RUN: grep {vrev16\\.8} %t | count 2
+
+define arm_apcscc void @test_vrev16Qs8() nounwind {
+entry:
+       %arg0_int8x16_t = alloca <16 x i8>              ; <<16 x i8>*> [#uses=1]
+       %out_int8x16_t = alloca <16 x i8>               ; <<16 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <16 x i8>* %arg0_int8x16_t, align 16          ; <<16 x i8>> [#uses=1]
+       %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>               ; <<16 x i8>> [#uses=1]
+       store <16 x i8> %1, <16 x i8>* %out_int8x16_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev16u8() nounwind {
+entry:
+       %arg0_uint8x8_t = alloca <8 x i8>               ; <<8 x i8>*> [#uses=1]
+       %out_uint8x8_t = alloca <8 x i8>                ; <<8 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <8 x i8>* %arg0_uint8x8_t, align 8            ; <<8 x i8>> [#uses=1]
+       %1 = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>                ; <<8 x i8>> [#uses=1]
+       store <8 x i8> %1, <8 x i8>* %out_uint8x8_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev32Qs16() nounwind {
+entry:
+       %arg0_uint16x8_t = alloca <8 x i16>             ; <<8 x i16>*> [#uses=1]
+       %out_uint16x8_t = alloca <8 x i16>              ; <<8 x i16>*> [#uses=1]
+       %"alloca point" = bitcast i16 0 to i16  ; <i16> [#uses=0]
+       %0 = load <8 x i16>* %arg0_uint16x8_t, align 16         ; <<8 x i16>> [#uses=1]
+       %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>              ; <<8 x i16>> [#uses=1]
+       store <8 x i16> %1, <8 x i16>* %out_uint16x8_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev32Qs8() nounwind {
+entry:
+       %arg0_int8x16_t = alloca <16 x i8>              ; <<16 x i8>*> [#uses=1]
+       %out_int8x16_t = alloca <16 x i8>               ; <<16 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <16 x i8>* %arg0_int8x16_t, align 16          ; <<16 x i8>> [#uses=1]
+       %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>               ; <<16 x i8>> [#uses=1]
+       store <16 x i8> %1, <16 x i8>* %out_int8x16_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev32u16() nounwind {
+entry:
+       %arg0_uint16x4_t = alloca <4 x i16>             ; <<4 x i16>*> [#uses=1]
+       %out_uint16x4_t = alloca <4 x i16>              ; <<4 x i16>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <4 x i16>* %arg0_uint16x4_t, align 8          ; <<4 x i16>> [#uses=1]
+       %1 = shufflevector <4 x i16> %0, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>                ; <<4 x i16>> [#uses=1]
+       store <4 x i16> %1, <4 x i16>* %out_uint16x4_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev32u8() nounwind {
+entry:
+       %arg0_uint8x8_t = alloca <8 x i8>               ; <<8 x i8>*> [#uses=1]
+       %out_uint8x8_t = alloca <8 x i8>                ; <<8 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <8 x i8>* %arg0_uint8x8_t, align 8            ; <<8 x i8>> [#uses=1]
+       %1 = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>                ; <<8 x i8>> [#uses=1]
+       store <8 x i8> %1, <8 x i8>* %out_uint8x8_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64Qs16() nounwind {
+entry:
+       %arg0_uint16x8_t = alloca <8 x i16>             ; <<8 x i16>*> [#uses=1]
+       %out_uint16x8_t = alloca <8 x i16>              ; <<8 x i16>*> [#uses=1]
+       %"alloca point" = bitcast i16 0 to i16  ; <i16> [#uses=0]
+       %0 = load <8 x i16>* %arg0_uint16x8_t, align 16         ; <<8 x i16>> [#uses=1]
+       %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>              ; <<8 x i16>> [#uses=1]
+       store <8 x i16> %1, <8 x i16>* %out_uint16x8_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64Qs32() nounwind {
+entry:
+       %arg0_uint32x4_t = alloca <4 x i32>             ; <<4 x i32>*> [#uses=1]
+       %out_uint32x4_t = alloca <4 x i32>              ; <<4 x i32>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <4 x i32>* %arg0_uint32x4_t, align 16         ; <<4 x i32>> [#uses=1]
+       %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>                ; <<4 x i32>> [#uses=1]
+       store <4 x i32> %1, <4 x i32>* %out_uint32x4_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64Qs8() nounwind {
+entry:
+       %arg0_int8x16_t = alloca <16 x i8>              ; <<16 x i8>*> [#uses=1]
+       %out_int8x16_t = alloca <16 x i8>               ; <<16 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <16 x i8>* %arg0_int8x16_t, align 16          ; <<16 x i8>> [#uses=1]
+       %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>               ; <<16 x i8>> [#uses=1]
+       store <16 x i8> %1, <16 x i8>* %out_int8x16_t, align 16
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64u16() nounwind {
+entry:
+       %arg0_uint16x4_t = alloca <4 x i16>             ; <<4 x i16>*> [#uses=1]
+       %out_uint16x4_t = alloca <4 x i16>              ; <<4 x i16>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <4 x i16>* %arg0_uint16x4_t, align 8          ; <<4 x i16>> [#uses=1]
+       %1 = shufflevector <4 x i16> %0, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>                ; <<4 x i16>> [#uses=1]
+       store <4 x i16> %1, <4 x i16>* %out_uint16x4_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64u32() nounwind {
+entry:
+       %arg0_uint32x2_t = alloca <2 x i32>             ; <<2 x i32>*> [#uses=1]
+       %out_uint32x2_t = alloca <2 x i32>              ; <<2 x i32>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <2 x i32>* %arg0_uint32x2_t, align 8          ; <<2 x i32>> [#uses=1]
+       %1 = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> <i32 1, i32 0>              ; <<2 x i32>> [#uses=1]
+       store <2 x i32> %1, <2 x i32>* %out_uint32x2_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}
+
+define arm_apcscc void @test_vrev64u8() nounwind {
+entry:
+       %arg0_uint8x8_t = alloca <8 x i8>               ; <<8 x i8>*> [#uses=1]
+       %out_uint8x8_t = alloca <8 x i8>                ; <<8 x i8>*> [#uses=1]
+       %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+       %0 = load <8 x i8>* %arg0_uint8x8_t, align 8            ; <<8 x i8>> [#uses=1]
+       %1 = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>                ; <<8 x i8>> [#uses=1]
+       store <8 x i8> %1, <8 x i8>* %out_uint8x8_t, align 8
+       br label %return
+
+return:                ; preds = %entry
+       ret void
+}

Property changes on: test/CodeGen/ARM/vrev.ll
___________________________________________________________________
Added: svn:mergeinfo

Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td      (revision 76881)
+++ lib/Target/ARM/ARMInstrNEON.td      (working copy)
@@ -1662,6 +1662,79 @@
 def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;

+def vrev64_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 64);
+}]>;
+def vrev32_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 32);
+}]>;
+def vrev16_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return ARM::isVREVMask(SVOp, 16);
+}]>;
+
+def VREVQ64_8 : N2V<0b11,0b11,0b00,0b00,0b00000,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev64.8\t$dst, $src", "",
+                    [(set QPR:$dst, (v16i8 (vrev64_shuffle QPR:$src, undef)))]>;
+def VREVQ64_16 : N2V<0b11,0b11,0b01,0b00,0b00000,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev64.16\t$dst, $src", "",
+                    [(set QPR:$dst, (v8i16 (vrev64_shuffle QPR:$src, undef)))]>;
+def VREVQ64_32 : N2V<0b11,0b11,0b10,0b00,0b00000,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev64.32\t$dst, $src", "",
+                    [(set QPR:$dst, (v4i32 (vrev64_shuffle QPR:$src, undef)))]>;
+def VREVQ64_32f : N2V<0b11,0b11,0b10,0b00,0b00000,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev64.32\t$dst, $src", "",
+                    [(set QPR:$dst, (v4f32 (vrev64_shuffle QPR:$src, undef)))]>;
+def VREVQ32_8 : N2V<0b11,0b11,0b00,0b00,0b00001,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev32.8\t$dst, $src", "",
+                    [(set QPR:$dst, (v16i8 (vrev32_shuffle QPR:$src, undef)))]>;
+def VREVQ32_16 : N2V<0b11,0b11,0b01,0b00,0b00001,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev32.16\t$dst, $src", "",
+                    [(set QPR:$dst, (v8i16 (vrev32_shuffle QPR:$src, undef)))]>;
+def VREVQ16_8 : N2V<0b11,0b11,0b00,0b00,0b00010,1,0,
+                    (outs QPR:$dst), (ins QPR:$src),
+                    "vrev16.8\t$dst, $src", "",
+                    [(set QPR:$dst, (v16i8 (vrev16_shuffle QPR:$src, undef)))]>;
+
+def VREVD64_8 : N2V<0b11,0b11,0b00,0b00,0b00000,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev64.8\t$dst, $src", "",
+                   [(set DPR:$dst, (v8i8 (vrev64_shuffle DPR:$src, undef)))]>;
+def VREVD64_16 : N2V<0b11,0b11,0b01,0b00,0b00000,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev64.16\t$dst, $src", "",
+                   [(set DPR:$dst, (v4i16 (vrev64_shuffle DPR:$src, undef)))]>;
+def VREVD64_32 : N2V<0b11,0b11,0b10,0b00,0b00000,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev64.32\t$dst, $src", "",
+                   [(set DPR:$dst, (v2i32 (vrev64_shuffle DPR:$src, undef)))]>;
+def VREVD64_32f : N2V<0b11,0b11,0b10,0b00,0b00000,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev64.32\t$dst, $src", "",
+                   [(set DPR:$dst, (v2f32 (vrev64_shuffle DPR:$src, undef)))]>;
+def VREVD32_8 : N2V<0b11,0b11,0b00,0b00,0b00001,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev32.8\t$dst, $src", "",
+                   [(set DPR:$dst, (v8i8 (vrev32_shuffle DPR:$src, undef)))]>;
+def VREVD32_16 : N2V<0b11,0b11,0b01,0b00,0b00001,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev32.16\t$dst, $src", "",
+                   [(set DPR:$dst, (v4i16 (vrev32_shuffle DPR:$src, undef)))]>;
+def VREVD16_8 : N2V<0b11,0b11,0b00,0b00,0b00010,0,0,
+                   (outs DPR:$dst), (ins DPR:$src),
+                   "vrev16.8\t$dst, $src", "",
+                   [(set DPR:$dst, (v8i8 (vrev16_shuffle DPR:$src, undef)))]>;
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
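(For readers unfamiliar with the PatFrag mechanism above: the [{ ... }]
block is C++ that TableGen splices into the generated instruction
selector as a predicate on the vector_shuffle node. Roughly, the emitted
code behaves like the sketch below; this is a simplification for
illustration, and the actual generated function name and surrounding
matcher code may differ:)

    // Approximate shape of the predicate tblgen emits for vrev64_shuffle.
    static inline bool Predicate_vrev64_shuffle(SDNode *N) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      return ARM::isVREVMask(SVOp, 64);
    }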
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h    (revision 76881)
+++ lib/Target/ARM/ARMISelLowering.h    (working copy)
@@ -124,6 +124,7 @@
     /// return the constant being splatted.  The ByteSize field indicates the
     /// number of bytes of each element [1248].
     SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+    bool isVREVMask(ShuffleVectorSDNode *N, unsigned BlockSize);
   }

   //===--------------------------------------------------------------------===//
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp  (revision 76881)
+++ lib/Target/ARM/ARMISelLowering.cpp  (working copy)
@@ -3341,3 +3341,23 @@
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                       Ops, DAG);
 }
+
+bool ARM::isVREVMask(ShuffleVectorSDNode *N, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  unsigned EltSz = N->getValueType(0).getVectorElementType().getSizeInBits();
+  // Mask element 0 is the index of the last element in the first block.
+  unsigned BlockElts = N->getMaskElt(0) + 1;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  // Each block of BlockElts elements must be reversed in place.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if ((unsigned)N->getMaskElt(i) !=
+        (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  return true;
+}
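To make the mask check concrete: for vrev32.16 on v8i16, BlockSize is 32
and EltSz is 16, so BlockElts is 2, and the formula above enumerates
exactly the <1, 0, 3, 2, 5, 4, 7, 6> mask used by test_vrev32Qs16 in the
test file. A standalone snippet (illustrative only, not part of the
patch) that reproduces the expected indices:

    #include <cstdio>

    int main() {
      const unsigned BlockSize = 32, EltSz = 16, NumElts = 8;
      const unsigned BlockElts = BlockSize / EltSz; // 2 elements per block
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%u ", (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts));
      std::printf("\n"); // prints: 1 0 3 2 5 4 7 6
      return 0;
    }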




