[PATCH] D12334: [ARM] Do not use vtrn for vectorshuffle if the order is reversed

Tue Aug 25 14:26:10 PDT 2015

jketema created this revision.
jketema added reviewers: rengolin, aemerson.
jketema added a subscriber: llvm-commits.
Herald added subscribers: rengolin, aemerson.

The checks in isVTRNMask and isVTRN_v_undef_Mask should also verify that the elements of the upper and lower halves of the vector shuffle mask occur in the correct order when both halves are used. Without this check, the code assumes that it is correct to use vector transpose (vtrn) for masks such as <1, 1, 0, 0> and <1, 3, 0, 2>, but in these cases the transpose actually produces the shuffles <0, 0, 1, 1> and <0, 2, 1, 3>, which is incorrect.

http://reviews.llvm.org/D12334

Files:
  lib/Target/ARM/ARMISelLowering.cpp
  test/CodeGen/ARM/vuzp.ll
  test/CodeGen/ARM/vzip.ll

Index: test/CodeGen/ARM/vzip.ll
===================================================================

--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -295,3 +295,13 @@
   ret <4 x i32> %0
 }
 
+define void @vzip_undef_rev_shufflemask_vtrn(<2 x i32>* %A, <4 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+  store <4 x i32> %0, <4 x i32>* %B
+  ret void
+}
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -286,6 +286,19 @@
   ret <4 x i32> %0
 }
 
+define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  store <4 x i32> %0, <4 x i32>* %C
+  ret void
+}
+
+
 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -5045,8 +5045,16 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
-  // If the mask is twice as long as the result then we need to check the upper
-  // and lower parts of the mask
+  // If the mask is twice as long as the input vector then we need to check that
+  // each element of the first half is smaller than its second-half counterpart.
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i+NumElts])
+        return false;
+
+  // If the mask is twice as long as the input vector then we need to check the
+  // upper and lower parts of the mask
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
@@ -5074,6 +5082,12 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i+NumElts])
+        return false;
+
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D12334.33120.patch
Type: text/x-patch
Size: 2949 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150825/22705693/attachment.bin>