[llvm] [AArch64] Prevent generating tbl instruction instead of smull (PR #106375)
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 01:53:03 PDT 2024
https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/106375
>From 46451b4cbcae7aef7875cd7a2b1077bd1f57eceb Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Thu, 22 Aug 2024 14:41:31 +0000
Subject: [PATCH 1/2] [AArch64] Prevent generating tbl instruction instead of
smull
Generating a tbl instruction for the zext in an expression like
mul(zext(i8), sext) is not optimal.
Instead, allowing later optimisations to generate smull(zext, sext)
would perform some of the type extensions implicitly and be faster.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 10 ++
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 148 ++++++++++++++++++
2 files changed, 158 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 215f30128e7038..e2986d42da956b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16795,6 +16795,16 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
DstTy = TruncDstType;
}
+
+ // mul(zext(i8), sext) can be transformed into smull(zext, sext) when
+ // destination type is at least i32, which is faster than using tbl
+ // instructions
+ if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
+ return false;
+ }
+
IRBuilder<> Builder(ZExt);
Value *Result = createTblShuffleForZExt(
Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 66bb131ce72494..bb98a7c1dcb0e8 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -3154,3 +3154,151 @@ loop:
exit:
ret i32 0
}
+
+; Not profitable to convert when there is an opportunity to partially
+; reduce the cost of widening by generating smull
+define i32 @mul_zext_16i8_sext_16i8(ptr %p1, ptr %p2, i32 %h) {
+; CHECK-LABEL: mul_zext_16i8_sext_16i8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: LBB27_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1], #16
+; CHECK-NEXT: subs w2, w2, #1
+; CHECK-NEXT: sshll2.8h v2, v0, #0
+; CHECK-NEXT: ushll2.8h v3, v1, #0
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: ushll.8h v1, v1, #0
+; CHECK-NEXT: smull2.4s v4, v2, v3
+; CHECK-NEXT: smull.4s v2, v2, v3
+; CHECK-NEXT: smull.4s v3, v0, v1
+; CHECK-NEXT: smull2.4s v0, v0, v1
+; CHECK-NEXT: stp q2, q4, [x0, #32]
+; CHECK-NEXT: str q3, [x0]
+; CHECK-NEXT: str q0, [x0, #16]!
+; CHECK-NEXT: b.ne LBB27_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: mul_zext_16i8_sext_16i8:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: .LBB27_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x1]
+; CHECK-BE-NEXT: add x8, x0, #48
+; CHECK-BE-NEXT: subs w2, w2, #1
+; CHECK-BE-NEXT: add x1, x1, #16
+; CHECK-BE-NEXT: sshll2 v2.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll2 v3.8h, v1.16b, #0
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-BE-NEXT: smull2 v4.4s, v2.8h, v3.8h
+; CHECK-BE-NEXT: smull v2.4s, v2.4h, v3.4h
+; CHECK-BE-NEXT: smull v3.4s, v0.4h, v1.4h
+; CHECK-BE-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #32
+; CHECK-BE-NEXT: st1 { v3.4s }, [x0]
+; CHECK-BE-NEXT: add x0, x0, #16
+; CHECK-BE-NEXT: st1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x0]
+; CHECK-BE-NEXT: b.ne .LBB27_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: mov w0, wzr
+; CHECK-BE-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.1 = getelementptr inbounds <16 x i8>, ptr %p1, i32 %iv
+ %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i32 %iv
+ %l1 = load <16 x i8>, ptr %gep.1
+ %z1 = sext <16 x i8> %l1 to <16 x i32>
+ %l4 = load <16 x i8>, ptr %gep.2
+ %z5 = zext <16 x i8> %l4 to <16 x i32>
+ %mul = mul <16 x i32> %z1, %z5
+ store <16 x i32> %mul, ptr %gep.1
+ %iv.next= add nuw nsw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %iv.next, %h
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret i32 0
+}
+
+; Same as above but different type
+define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
+; CHECK-LABEL: mul_zext_16i8_sext_16i16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: LBB28_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x1], #16
+; CHECK-NEXT: ldr q3, [x0]
+; CHECK-NEXT: ldr q2, [x8, #16]!
+; CHECK-NEXT: subs w2, w2, #1
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: smull2.4s v4, v2, v1
+; CHECK-NEXT: smull.4s v1, v2, v1
+; CHECK-NEXT: smull2.4s v2, v3, v0
+; CHECK-NEXT: smull.4s v0, v3, v0
+; CHECK-NEXT: stp q1, q4, [x0, #32]
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str q2, [x8]
+; CHECK-NEXT: b.ne LBB28_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: mul_zext_16i8_sext_16i16:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: .LBB28_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x1]
+; CHECK-BE-NEXT: ld1 { v1.8h }, [x0]
+; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: ld1 { v3.8h }, [x8]
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: add x10, x0, #32
+; CHECK-BE-NEXT: subs w2, w2, #1
+; CHECK-BE-NEXT: add x1, x1, #16
+; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT: smull v4.4s, v1.4h, v2.4h
+; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h
+; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h
+; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h
+; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
+; CHECK-BE-NEXT: mov x0, x8
+; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: b.ne .LBB28_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: mov w0, wzr
+; CHECK-BE-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.1 = getelementptr inbounds <16 x i8>, ptr %p1, i32 %iv
+ %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i32 %iv
+ %l1 = load <16 x i16>, ptr %gep.1
+ %z1 = sext <16 x i16> %l1 to <16 x i32>
+ %l4 = load <16 x i8>, ptr %gep.2
+ %z5 = zext <16 x i8> %l4 to <16 x i32>
+ %mul = mul <16 x i32> %z1, %z5
+ store <16 x i32> %mul, ptr %gep.1
+ %iv.next= add nuw nsw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %iv.next, %h
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret i32 0
+}
>From 9fe5348b42a1d4127a9db4f7fc0254e11cf55ef3 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Thu, 29 Aug 2024 08:50:39 +0000
Subject: [PATCH 2/2] Update comment
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e2986d42da956b..8e721914577bae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16797,7 +16797,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
}
// mul(zext(i8), sext) can be transformed into smull(zext, sext) when
- // destination type is at least i32, which is faster than using tbl
+ // destination type is at least SrcWidth * 4, which is faster than using tbl
// instructions
if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
More information about the llvm-commits
mailing list