[llvm] 14f54a5 - [DAG][AArch64] Fold shuffle_vector<4,5,6,7> to extract_subvector

Fri Jun 30 03:13:44 PDT 2023

Author: David Green
Date: 2023-06-30T11:13:39+01:00
New Revision: 14f54a594e5b5e215844f03e598805bd8c377003

URL: https://github.com/llvm/llvm-project/commit/14f54a594e5b5e215844f03e598805bd8c377003
DIFF: https://github.com/llvm/llvm-project/commit/14f54a594e5b5e215844f03e598805bd8c377003.diff

LOG: [DAG][AArch64] Fold shuffle_vector<4,5,6,7> to extract_subvector

During legalization, we can end up with shuffles whose masks are identity
masks, so they act like extract_subvector but are not simplified to
extract_subvector. This adjusts the profitability heuristic in
foldExtractSubvectorFromShuffleVector to allow identity masks that do not
start at element 0. Masks containing undef elements are excluded, as it can be
more useful to keep the undef elements.

Differential Revision: https://reviews.llvm.org/D153504

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/add-extract.ll
    llvm/test/CodeGen/AArch64/shuffles.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8c0b3831694e8c..ada1176610d434 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23720,10 +23720,6 @@ static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
       continue;
     }
 
-    // Profitability check: only deal with extractions from the first subvector.
-    if (OpSubvecIdx != 0)
-      return SDValue();
-
     const std::pair<SDValue, int> DemandedSubvector =
         std::make_pair(Op, OpSubvecIdx);
 
@@ -23753,6 +23749,14 @@ static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
   if (DemandedSubvectors.empty())
     return DAG.getUNDEF(NarrowVT);
 
+  // Profitability check: only deal with extractions from the first subvector
+  // unless the mask becomes an identity mask.
+  if (!ShuffleVectorInst::isIdentityMask(NewMask) ||
+      any_of(NewMask, [](int M) { return M < 0; }))
+    for (auto &DemandedSubvector : DemandedSubvectors)
+      if (DemandedSubvector.second != 0)
+        return SDValue();
+
   // We still perform the exact same EXTRACT_SUBVECTOR,  just on different
   // operand[s]/index[es], so there is no point in checking for it's legality.
 

diff --git a/llvm/test/CodeGen/AArch64/add-extract.ll b/llvm/test/CodeGen/AArch64/add-extract.ll
index 7d2366898dd2bc..58b833529cc640 100644
--- a/llvm/test/CodeGen/AArch64/add-extract.ll
+++ b/llvm/test/CodeGen/AArch64/add-extract.ll
@@ -83,9 +83,9 @@ define i32 @add_i32_ext_load(<1 x i32> %A, ptr %B) nounwind {
 define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    add d0, d0, d1
-; CHECK-NEXT:    dup v1.2d, v1.d[1]
-; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    add d0, d0, d2
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
@@ -99,9 +99,9 @@ define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 define i64 @sub_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: sub_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    sub d0, d0, d1
-; CHECK-NEXT:    dup v1.2d, v1.d[1]
-; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    sub d0, d0, d2
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0

diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
index 9b908dc0be0a97..7be836af6342a2 100644
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -262,3 +262,16 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
   %r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 2, i32 7, i32 2, i32 0, i32 3, i32 2, i32 15>
   ret <8 x half> %r
 }
+
+define <4 x i32> @extract_shuffle(<8 x i16> %j, <4 x i16> %k) {
+; CHECK-LABEL: extract_shuffle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #3
+; CHECK-NEXT:    ret
+  %a = shufflevector <8 x i16> %j, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %b = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %c = zext <4 x i16> %b to <4 x i32>
+  %d = shl <4 x i32> %c, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %d
+}
+

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index b9080ed84bbed3..fd0811bbf6580b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -56,36 +56,36 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK-NEXT:    mov v1.b[5], w10
 ; CHECK-NEXT:    umov w10, v0.b[14]
 ; CHECK-NEXT:    mov v2.b[5], w8
-; CHECK-NEXT:    mov x8, #16
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    mov v1.b[6], w9
-; CHECK-NEXT:    mov x9, #24
+; CHECK-NEXT:    mov x9, #24 // =0x18
 ; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    mov v2.b[6], w10
 ; CHECK-NEXT:    umov w10, v0.b[15]
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT:    dup v3.2d, v0.d[1]
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    mov v1.b[7], w11
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov v2.b[7], w10
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
+; CHECK-NEXT:    mov x11, #8 // =0x8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    mov x11, #8
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    lsl z3.s, z3.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
-; CHECK-NEXT:    asr z3.s, z3.s, #31
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    asr z0.s, z3.s, #31
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    and z3.s, z3.s, #0x1
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x11, lsl #2]
-; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
+; CHECK-NEXT:    cmpne p2.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    lsl z2.s, z2.s, #31
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
@@ -96,12 +96,12 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK-NEXT:    mov z5.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    cmpne p2.s, p0/z, z2.s, #0
-; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z5.s }, p0, [x0, x9, lsl #2]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0, x11, lsl #2]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x0, x11, lsl #2]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer