[llvm] [AArch64] Remove EXT instr before UZP when extracting elements from vector (PR #91328)

Tue May 7 06:54:00 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: None (Lukacma)

<details>
<summary>Changes</summary>

Assembly generated  for getting odd/even elements from vector contained extra EXT instruction. This was due to way llvm constructs DAGs when vector_shuffling from larger type to smaller. This patch optimises DAG in these situations, allowing for correct assembly to be emitted. 

---
Full diff: https://github.com/llvm/llvm-project/pull/91328.diff


7 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+22) 
- (modified) llvm/test/CodeGen/AArch64/aarch64-vuzp.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll (+9-9) 
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll (+14-15) 
- (modified) llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll (+7-15) 
- (modified) llvm/test/CodeGen/AArch64/neon-perm.ll (+3-3) 
- (modified) llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll (+24-24) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2af679e0755b54..4cf3c4fb007faf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21454,6 +21454,27 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);
   EVT ResVT = N->getValueType(0);
 
+  // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
+  if(Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR && 
+     Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+     Op0.getOperand(0) == Op1.getOperand(0)){
+  
+    SDValue SourceVec = Op0.getOperand(0);
+    uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
+    uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
+    uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
+    if(ExtIdx0 == 0 && ExtIdx1 == NumElements/2){
+      EVT OpVT = Op0.getOperand(1).getValueType();
+      EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+      SDValue uzp2 = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec, SourceVec);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, uzp2, DAG.getConstant(0, DL, OpVT));
+    }
+  }
+
+  // following optimization only work with uzp1
+  if (N->getOpcode() == AArch64ISD::UZP2)
+    return SDValue();
+
   // uzp1(x, undef) -> concat(truncate(x), undef)
   if (Op1.getOpcode() == ISD::UNDEF) {
     EVT BCVT = MVT::Other, HalfVT = MVT::Other;
@@ -24670,6 +24691,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::UUNPKHI:
     return performUnpackCombine(N, DAG, Subtarget);
   case AArch64ISD::UZP1:
+  case AArch64ISD::UZP2:
     return performUzpCombine(N, DAG, Subtarget);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DCI);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index ba1ad9ba989c6f..4f76a2dd5947a7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -3,7 +3,7 @@
 declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
 
 ; CHECK-LABEL: fun1:
-; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 define i32 @fun1() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
@@ -14,7 +14,7 @@ entry:
 }
 
 ; CHECK-LABEL: fun2:
-; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 define i32 @fun2() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index dae8d9f89e9954..3157257083e52d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -7,18 +7,18 @@ target triple = "aarch64"
 define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: complex_add_v4f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uzp1 z4.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
 ; CHECK-NEXT:    fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index c09ec616b015df..15e9b903db5066 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -7,23 +7,22 @@ target triple = "aarch64"
 define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: complex_mul_v4f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uzp2 z4.d, z0.d, z2.d
-; CHECK-NEXT:    uzp1 z0.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
-; CHECK-NEXT:    movprfx z5, z2
-; CHECK-NEXT:    fmul z5.h, p0/m, z5.h, z0.h
-; CHECK-NEXT:    fmul z2.h, p0/m, z2.h, z4.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    fmla z3.h, p0/m, z1.h, z4.h
-; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    movprfx z4, z3
+; CHECK-NEXT:    fmul z4.h, p0/m, z4.h, z0.h
+; CHECK-NEXT:    fmul z3.h, p0/m, z3.h, z2.h
+; CHECK-NEXT:    fmad z2.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index c58db8290c87ab..5bd680ed489389 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -30,21 +30,13 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 }
 
 define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
-; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    fmov d0, d2
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-GI-NEXT:    fmov d0, d2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
   %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>}   %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 26ffa2727a1cd1..645eb6b8f430e8 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -3888,9 +3888,9 @@ entry:
 define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
 ; CHECK-LABEL: test_uzp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uzp1 v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    uzp2 v1.8b, v0.8b, v1.8b
+; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
+; CHECK-NEXT:    uzp2 v1.16b, v0.16b, v0.16b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    fmov d0, d2
 ; CHECK-NEXT:    ret
 
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 478f4a689d3c7a..fd1365d56fee47 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -4,10 +4,10 @@
 define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z2.s
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>}   %retval
@@ -16,10 +16,10 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
 define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z1.h
+; CHECK-NEXT:    uunpklo z1.s, z2.h
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>}   %retval
@@ -39,10 +39,10 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
 define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z2.s
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>}   %retval
@@ -131,10 +131,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p1.h
-; CHECK-NEXT:    uzp2 p1.h, p2.h, p1.h
+; CHECK-NEXT:    uzp1 p1.b, p0.b, p0.b
+; CHECK-NEXT:    uzp2 p2.b, p0.b, p0.b
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
   ret {<vscale x 8 x i1>, <vscale x 8 x i1>}   %retval
@@ -143,10 +143,10 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
 define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.s, p2.s, p1.s
-; CHECK-NEXT:    uzp2 p1.s, p2.s, p1.s
+; CHECK-NEXT:    uzp1 p1.h, p0.h, p0.h
+; CHECK-NEXT:    uzp2 p2.h, p0.h, p0.h
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
   ret {<vscale x 4 x i1>, <vscale x 4 x i1>}   %retval
@@ -155,10 +155,10 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
 define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.d, p2.d, p1.d
-; CHECK-NEXT:    uzp2 p1.d, p2.d, p1.d
+; CHECK-NEXT:    uzp1 p1.s, p0.s, p0.s
+; CHECK-NEXT:    uzp2 p2.s, p0.s, p0.s
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
   ret {<vscale x 2 x i1>, <vscale x 2 x i1>}   %retval

``````````

</details>


https://github.com/llvm/llvm-project/pull/91328