[llvm] 28b4123 - [InterleavedAccessPass] Handle multi-use binop shuffles
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 10 09:24:42 PDT 2022
Author: David Green
Date: 2022-07-10T17:24:37+01:00
New Revision: 28b41237e6b296bf777d2f0c13c48031525fcdc4
URL: https://github.com/llvm/llvm-project/commit/28b41237e6b296bf777d2f0c13c48031525fcdc4
DIFF: https://github.com/llvm/llvm-project/commit/28b41237e6b296bf777d2f0c13c48031525fcdc4.diff
LOG: [InterleavedAccessPass] Handle multi-use binop shuffles
D89489 added some logic to the interleaved access pass to attempt to
undo the folding of shuffles into binops that instcombine performs. If
early-cse is also run, the binops may be commoned into a single
operation with multiple shuffle uses. It is still profitable to reverse
the transform in that case, so long as all of the uses are shuffles.
Differential Revision: https://reviews.llvm.org/D129419
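For illustration only, a rough sketch of the IR shape this now handles
(value names and types here are made up, modelled on the vld2_multiuse
test below, not taken from the patch): after instcombine folds the
extracting shuffles into the multiplies and early-cse commons them, a
single fmul is left with two shufflevector uses, which the pass can now
still turn back into an interleaved (ld2-style) load:

  ; sketch only, hypothetical names
  %wide = load <8 x float>, <8 x float>* %pSrc
  %sq   = fmul <8 x float> %wide, %wide
  %even = shufflevector <8 x float> %sq, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd  = shufflevector <8 x float> %sq, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum  = fadd <4 x float> %odd, %even
  store <4 x float> %sum, <4 x float>* %pDst

Previously the single-use check on the binop rejected this, since the
commoned fmul has two shuffle users.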
Added:
Modified:
llvm/lib/CodeGen/InterleavedAccessPass.cpp
llvm/test/CodeGen/AArch64/vldn_shuffle.ll
llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index b3f38a3b53f3..55f3ad796291 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -310,10 +310,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
Extracts.push_back(Extract);
continue;
}
- auto *BI = dyn_cast<BinaryOperator>(User);
- if (BI && BI->hasOneUse()) {
- if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) {
- BinOpShuffles.insert(SVI);
+ if (auto *BI = dyn_cast<BinaryOperator>(User)) {
+ if (all_of(BI->users(),
+ [](auto *U) { return isa<ShuffleVectorInst>(U); })) {
+ for (auto *SVI : BI->users())
+ BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
continue;
}
}
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 775878122d61..d72dcd5ca05e 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -188,13 +188,10 @@ define void @vld2_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB4_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q0, [x0], #32
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: uzp1 v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s
+; CHECK-NEXT: str q2, [x1, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB4_1
@@ -230,25 +227,11 @@ define void @vld3_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB5_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: ldr q3, [x0, #32]
-; CHECK-NEXT: add x0, x0, #48
-; CHECK-NEXT: mov v2.16b, v0.16b
-; CHECK-NEXT: mov v2.s[1], v0.s[3]
-; CHECK-NEXT: rev64 v4.4s, v1.4s
-; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT: mov v2.s[2], v1.s[2]
-; CHECK-NEXT: mov v4.s[0], v0.s[1]
-; CHECK-NEXT: mov v1.s[0], v0.s[2]
-; CHECK-NEXT: mov v2.s[3], v3.s[1]
-; CHECK-NEXT: mov v4.s[3], v3.s[2]
-; CHECK-NEXT: mov v1.s[2], v3.s[0]
-; CHECK-NEXT: fadd v0.4s, v4.4s, v2.4s
-; CHECK-NEXT: mov v1.s[3], v3.s[3]
-; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s
+; CHECK-NEXT: str q3, [x1, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB5_1
@@ -286,31 +269,15 @@ define void @vld4_multiuse(float* nocapture readonly %pSrc, float* noalias nocap
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB6_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q0, [x0, #32]
+; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
+; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT: add x9, x1, x8
; CHECK-NEXT: add x8, x8, #32
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: ldp q3, q2, [x0], #64
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT: zip2 v5.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v16.4s, v3.4s, v2.4s
-; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8
-; CHECK-NEXT: trn2 v7.4s, v3.4s, v2.4s
-; CHECK-NEXT: mov v1.s[3], v0.s[2]
-; CHECK-NEXT: zip1 v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: zip2 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp2 v3.4s, v16.4s, v3.4s
-; CHECK-NEXT: mov v7.d[1], v4.d[1]
-; CHECK-NEXT: mov v0.d[1], v6.d[1]
-; CHECK-NEXT: mov v2.d[1], v1.d[1]
-; CHECK-NEXT: mov v3.d[1], v5.d[1]
-; CHECK-NEXT: fadd v0.4s, v7.4s, v0.4s
-; CHECK-NEXT: fadd v1.4s, v3.4s, v2.4s
-; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
+; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
+; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT: b.ne .LBB6_1
; CHECK-NEXT: // %bb.2: // %while.end
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index 770c056779aa..3dcdca362015 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -286,30 +286,10 @@ define void @arm_cmplx_mag_squared_f16_cse(half* nocapture readonly %pSrc, half*
; CHECK-NEXT: and r5, r2, #7
; CHECK-NEXT: .LBB2_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q0, [r0], #32
+; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT: vmul.f16 q0, q0, q0
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vmul.f16 q2, q2, q2
-; CHECK-NEXT: vmovx.f16 s5, s2
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s3
-; CHECK-NEXT: vins.f16 s5, s6
-; CHECK-NEXT: vmovx.f16 s6, s8
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vmovx.f16 s7, s10
-; CHECK-NEXT: vins.f16 s6, s12
-; CHECK-NEXT: vmovx.f16 s12, s11
-; CHECK-NEXT: vins.f16 s2, s3
-; CHECK-NEXT: vins.f16 s10, s11
-; CHECK-NEXT: vins.f16 s8, s9
-; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vmov.f32 s1, s2
-; CHECK-NEXT: vins.f16 s7, s12
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s10
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vfma.f16 q0, q1, q1
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: le lr, .LBB2_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
@@ -413,8 +393,7 @@ define void @arm_cmplx_mag_squared_f32_cse(float* nocapture readonly %pSrc, floa
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: beq .LBB3_8
+; CHECK-NEXT: cbz r2, .LBB3_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #4
; CHECK-NEXT: blo .LBB3_9
@@ -435,19 +414,10 @@ define void @arm_cmplx_mag_squared_f32_cse(float* nocapture readonly %pSrc, floa
; CHECK-NEXT: and r5, r2, #3
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0], #32
+; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT: vmul.f32 q0, q0, q0
-; CHECK-NEXT: vmul.f32 q1, q1, q1
-; CHECK-NEXT: vmov.f32 s8, s4
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s5, s7
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vadd.f32 q0, q1, q2
+; CHECK-NEXT: vfma.f32 q0, q1, q1
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block