[llvm] r247254 - [ARM] Do not use vtrn for vectorshuffle if the order is reversed

Thu Sep 10 01:42:29 PDT 2015

Author: jamesm
Date: Thu Sep 10 03:42:28 2015
New Revision: 247254

URL: http://llvm.org/viewvc/llvm-project?rev=247254&view=rev
Log:
[ARM] Do not use vtrn for vectorshuffle if the order is reversed

The tests in isVTRNMask and isVTRN_v_undef_Mask should also check that the elements of the upper and lower half of the vectorshuffle occur in the correct order when both halves are used. Without this test the code assumes that it is correct to use vector transpose (vtrn) for the masks <1, 1, 0, 0> and <1, 3, 0, 2>, among others, but the transpose actually incorrectly generates shuffles for <0, 0, 1, 1> and <0, 2, 1, 3> in this case.

Patch by Jeroen Ketema!

Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/test/CodeGen/ARM/vtrn.ll
    llvm/trunk/test/CodeGen/ARM/vuzp.ll
    llvm/trunk/test/CodeGen/ARM/vzip.ll

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=247254&r1=247253&r2=247254&view=diff
==============================================================================

--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Sep 10 03:42:28 2015
@@ -5050,10 +5050,16 @@ static bool isVTRNMask(ArrayRef<int> M,
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
-  // If the mask is twice as long as the result then we need to check the upper
-  // and lower parts of the mask
+  // If the mask is twice as long as the input vector then we need to check the
+  // upper and lower parts of the mask with a matching value for WhichResult
+  // FIXME: A mask with only even values will be rejected in case the first
+  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
+  // M[0] is used to determine WhichResult
   for (unsigned i = 0; i < M.size(); i += NumElts) {
-    WhichResult = M[i] == 0 ? 0 : 1;
+    if (M.size() == NumElts * 2)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
@@ -5080,7 +5086,10 @@ static bool isVTRN_v_undef_Mask(ArrayRef
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
-    WhichResult = M[i] == 0 ? 0 : 1;
+    if (M.size() == NumElts * 2)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))

Modified: llvm/trunk/test/CodeGen/ARM/vtrn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vtrn.ll?rev=247254&r1=247253&r2=247254&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vtrn.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vtrn.ll Thu Sep 10 03:42:28 2015
@@ -371,3 +371,31 @@ define <8 x i8> @vtrn_mismatched_builvec
   %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
   ret <8 x i8> %rv
 }
+
+; Negative test that should not generate a vtrn
+define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
+entry:
+  ; CHECK-LABEL: lower_twice_no_vtrn
+  ; CHECK: @ BB#0:
+  ; CHECK-NOT: vtrn
+  ; CHECK: mov pc, lr
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
+  store <8 x i16> %0, <8 x i16>* %C
+  ret void
+}
+
+; Negative test that should not generate a vtrn
+define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
+entry:
+  ; CHECK-LABEL: upper_twice_no_vtrn
+  ; CHECK: @ BB#0:
+  ; CHECK-NOT: vtrn
+  ; CHECK: mov pc, lr
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
+  store <8 x i16> %0, <8 x i16>* %C
+  ret void
+}

Modified: llvm/trunk/test/CodeGen/ARM/vuzp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vuzp.ll?rev=247254&r1=247253&r2=247254&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vuzp.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vuzp.ll Thu Sep 10 03:42:28 2015
@@ -286,6 +286,18 @@ entry:
   ret <4 x i32> %0
 }
 
+define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  store <4 x i32> %0, <4 x i32>* %C
+  ret void
+}
+
 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to

Modified: llvm/trunk/test/CodeGen/ARM/vzip.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vzip.ll?rev=247254&r1=247253&r2=247254&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vzip.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vzip.ll Thu Sep 10 03:42:28 2015
@@ -295,3 +295,13 @@ entry:
   ret <4 x i32> %0
 }
 
+define void @vzip_undef_rev_shufflemask_vtrn(<2 x i32>* %A, <4 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+  store <4 x i32> %0, <4 x i32>* %B
+  ret void
+}