[llvm] cc9495f - [AArch64] Only mark cost 1 perfect shuffles as legal

Tue Apr 19 04:59:00 PDT 2022

Author: David Green
Date: 2022-04-19T12:58:55+01:00
New Revision: cc9495f6791aaaacbff55d7eace43550eb27e3c9

URL: https://github.com/llvm/llvm-project/commit/cc9495f6791aaaacbff55d7eace43550eb27e3c9
DIFF: https://github.com/llvm/llvm-project/commit/cc9495f6791aaaacbff55d7eace43550eb27e3c9.diff

LOG: [AArch64] Only mark cost 1 perfect shuffles as legal

The perfect shuffle tables encode a cost of either 0 (a nop-copy) or 1
(a single instruction) with a cost encoding of 0 in the upper 2 bits.
All perfect shuffles with any cost are then marked as legal shuffles
though (the maximum encoded cost is 3), which can confuse the DAG
combiner into thinking the shuffles are cheaper than they should be.

Limiting legal shuffles to single instructions seems to do better in
most cases, producing fewer instructions for complex shuffles. There are
some cases that now become tbl, which may be better or worse depending
on whether the instruction is in a loop and the tbl load can be hoisted
out.

Differential Revision: https://reviews.llvm.org/D123377

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
    llvm/test/CodeGen/AArch64/build-vector-extract.ll
    llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
    llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
    llvm/test/CodeGen/AArch64/shuffles.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3a698b77f5903..9d1fb367b9869 100644

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11489,7 +11489,9 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
     unsigned Cost = (PFEntry >> 30);
 
-    if (Cost <= 4)
+    // The cost tables encode cost 0 or cost 1 shuffles using the value 0 in
+    // the top 2 bits.
+    if (Cost == 0)
       return true;
   }
 

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
index 7872ceffbfa71..4cd7c4a034c87 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
@@ -7,10 +7,9 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
-; CHECK-NEXT:    ext v1.8b, v0.8b, v1.8b, #4
-; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    dup v0.4h, v0.h[4]
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
   ; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext

diff  --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
index f648e6dd9b10c..998a5304a19a8 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
@@ -30,9 +30,8 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    zip1 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -57,9 +56,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 ; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov w8, v0.s[2]
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -110,9 +108,8 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; CHECK-LABEL: extract0_i32_zext_insert1_i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    zip1 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    dup v0.2d, x8
 ; CHECK-NEXT:    ret
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -137,9 +134,8 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    dup v0.2d, x8
 ; CHECK-NEXT:    ret
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64

diff  --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
index e3f9f485f20bb..041d0b40a8b8e 100644
--- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
@@ -46,9 +46,11 @@ entry:
 define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: v8i16_2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev64 v2.4h, v0.4h
-; CHECK-NEXT:    rev64 v0.4h, v1.4h
-; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

diff  --git a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
index 89e91944bc3df..845d88ebf3bd4 100644
--- a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
@@ -136,11 +136,11 @@ entry:
 define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: shuffle_widen_faili1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v2.4h, v0.4h
-; CHECK-NEXT:    rev32 v3.4h, v1.4h
-; CHECK-NEXT:    ext v1.8b, v2.8b, v1.8b, #4
-; CHECK-NEXT:    ext v0.8b, v3.8b, v0.8b, #4
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    adrp x8, .LCPI12_0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 0, i32 1,
@@ -151,11 +151,11 @@ entry:
 define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: shuffle_widen_fail2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
-; CHECK-NEXT:    trn1 v3.4h, v1.4h, v1.4h
-; CHECK-NEXT:    ext v1.8b, v2.8b, v1.8b, #4
-; CHECK-NEXT:    ext v0.8b, v3.8b, v0.8b, #4
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    adrp x8, .LCPI13_0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 6, i32 6, i32 0, i32 1,

diff  --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
index c27bae6815fec..7c9188a3311d7 100644
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -4,22 +4,21 @@
 define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 v3.4s, v7.4s, v6.4s
-; CHECK-NEXT:    ext v5.16b, v6.16b, v4.16b, #12
-; CHECK-NEXT:    uzp1 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v4.4s, v2.4s, v4.4s
-; CHECK-NEXT:    trn2 v3.4s, v7.4s, v3.4s
-; CHECK-NEXT:    ext v5.16b, v7.16b, v5.16b, #8
-; CHECK-NEXT:    trn2 v6.4s, v6.4s, v1.4s
-; CHECK-NEXT:    trn1 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #12
-; CHECK-NEXT:    ext v3.16b, v1.16b, v3.16b, #8
-; CHECK-NEXT:    rev64 v16.4s, v5.4s
-; CHECK-NEXT:    dup v7.4s, v7.s[0]
-; CHECK-NEXT:    ext v1.16b, v0.16b, v6.16b, #12
-; CHECK-NEXT:    mov v2.s[3], v7.s[3]
-; CHECK-NEXT:    ext v0.16b, v3.16b, v4.16b, #8
-; CHECK-NEXT:    ext v3.16b, v5.16b, v16.16b, #8
+; CHECK-NEXT:    uzp1 v16.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ext v3.16b, v6.16b, v4.16b, #12
+; CHECK-NEXT:    zip2 v6.4s, v7.4s, v6.4s
+; CHECK-NEXT:    uzp2 v17.4s, v2.4s, v4.4s
+; CHECK-NEXT:    trn2 v16.4s, v16.4s, v1.4s
+; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    trn2 v4.4s, v7.4s, v6.4s
+; CHECK-NEXT:    rev64 v5.4s, v7.4s
+; CHECK-NEXT:    trn1 v2.4s, v17.4s, v2.4s
+; CHECK-NEXT:    dup v6.4s, v7.s[0]
+; CHECK-NEXT:    mov v4.d[1], v1.d[1]
+; CHECK-NEXT:    mov v3.d[1], v5.d[1]
+; CHECK-NEXT:    ext v1.16b, v0.16b, v16.16b, #12
+; CHECK-NEXT:    mov v2.s[3], v6.s[3]
+; CHECK-NEXT:    mov v0.16b, v4.16b
 ; CHECK-NEXT:    ret
   %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
   ret <16 x i32> %s3
@@ -29,10 +28,9 @@ define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip2 v0.4s, v7.4s, v6.4s
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
 ; CHECK-NEXT:    trn2 v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    ext v0.16b, v1.16b, v0.16b, #8
-; CHECK-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
 ; CHECK-NEXT:    ret
   %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 29, i32 26, i32 7, i32 4>
   ret <4 x i32> %s3
@@ -64,10 +62,9 @@ define <4 x i32> @test_shuf4(<16 x i32> %x, <16 x i32> %y) {
 define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf5:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v1.4s, v7.4s
 ; CHECK-NEXT:    ext v0.16b, v6.16b, v4.16b, #12
-; CHECK-NEXT:    ext v0.16b, v7.16b, v0.16b, #8
-; CHECK-NEXT:    rev64 v1.4s, v0.4s
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
 ; CHECK-NEXT:    ret
   %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 27, i32 16, i32 31, i32 30>
   ret <4 x i32> %s3