[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

Thu Sep 14 23:36:31 PDT 2017

craig.topper created this revision.

This patch replaces the perm2f128 intrinsics with native shuffle vectors.

This uses a pretty simple approach to allocate source 0 to the lower half input and source 1 to the upper half input. Then its just a matter of filling in the indices to use either the lower or upper half of that specific source. This can result in the same source being used by both operands. InstCombine or SelectionDAGBuilder should be able to clean that up.


https://reviews.llvm.org/D37892

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/avx-builtins.c


Index: test/CodeGen/avx-builtins.c
===================================================================

--- test/CodeGen/avx-builtins.c
+++ test/CodeGen/avx-builtins.c
@@ -678,19 +678,19 @@
 
 __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_permute2f128_pd(A, B, 0x31);
 }
 
 __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_permute2f128_ps(A, B, 0x13);
 }
 
 __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
   // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_permute2f128_si256(A, B, 0x20);
 }
 
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -7923,6 +7923,44 @@
     return EmitX86Select(*this, Ops[4], Align, Ops[3]);
   }
 
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+
+    // This takes a very simple approach since there are two lanes and a
+    // shuffle can have 2 inputs. So we reserve the first input for the first
+    // lane and the second input for the second lane. This may result in
+    // duplicate sources, but this can be dealt with in the backend.
+
+    Value *OutOps[2];
+    uint32_t Indices[8];
+    for (unsigned l = 0; l != 2; ++l) {
+      // Determine the source for this lane.
+      if (Imm & (1 << ((l * 4) + 3)))
+        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
+      else if (Imm & (1 << ((l * 4) + 1)))
+        OutOps[l] = Ops[1];
+      else
+        OutOps[l] = Ops[0];
+
+      for (unsigned i = 0; i != NumElts/2; ++i) {
+        // Start with ith element of the source for this lane.
+        unsigned Idx = (l * NumElts) + i;
+        // If bit 0 of the immediate is set, switch to the high half of
+        // the source.
+        if (Imm & (1 << (l * 4)))
+          Idx += NumElts/2;
+        Indices[(l * (NumElts/2)) + i] = Idx;
+      }
+    }
+
+    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
+                                       makeArrayRef(Indices, NumElts),
+                                       "vperm");
+  }
+
   case X86::BI__builtin_ia32_movnti:
   case X86::BI__builtin_ia32_movnti64:
   case X86::BI__builtin_ia32_movntsd:


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D37892.115362.patch
Type: text/x-patch
Size: 3270 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170915/8f40457d/attachment.bin>