[llvm] b18161d - [AArch64] Handle vector with two different values
Jingu Kang via llvm-commits
llvm-commits at lists.llvm.org
Fri May 5 07:04:32 PDT 2023
Author: Jingu Kang
Date: 2023-05-05T14:42:59+01:00
New Revision: b18161d7850c5102e9882649278213e226bed610
URL: https://github.com/llvm/llvm-project/commit/b18161d7850c5102e9882649278213e226bed610
DIFF: https://github.com/llvm/llvm-project/commit/b18161d7850c5102e9882649278213e226bed610.diff
LOG: [AArch64] Handle vector with two different values
If a vector has two different values and it can be split into two sub
vectors with same length, generate two DUP and CONCAT_VECTORS/VECTOR_SHUFFLE.
For example,
t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
t24, t24, t24, t24, t24, t24, t24, t24
==>
t26: v8i8 = AArch64ISD::DUP t23
t28: v8i8 = AArch64ISD::DUP t24
t29: v16i8 = concat_vectors t26, t28
Differential Revision: https://reviews.llvm.org/D148347
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f55e269e4dd6..670781b771fb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12353,6 +12353,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
unsigned NumUndefLanes = 0;
SDValue Value;
SDValue ConstantValue;
+ SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
+ unsigned ConsecutiveValCount = 0;
+ SDValue PrevVal;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
@@ -12380,6 +12383,24 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
usesOnlyOneValue = false;
++NumDifferentLanes;
}
+
+ if (PrevVal != V) {
+ ConsecutiveValCount = 0;
+ PrevVal = V;
+ }
+
+ // Keep different values and its last consecutive count. For example,
+ //
+ // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
+ // t24, t24, t24, t24, t24, t24, t24, t24
+ // t23 = consecutive count 8
+ // t24 = consecutive count 8
+ // ------------------------------------------------------------------
+ // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
+ // t24, t24, t24, t24, t24, t24, t24, t24
+ // t23 = consecutive count 5
+ // t24 = consecutive count 9
+ DifferentValueMap[V] = ++ConsecutiveValCount;
}
if (!Value.getNode()) {
@@ -12585,6 +12606,82 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return NewVector;
}
+ // If vector consists of two different values, try to generate two DUPs and
+ // (CONCAT_VECTORS or VECTOR_SHUFFLE).
+ if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
+ SmallVector<SDValue, 2> Vals;
+ // Check the consecutive count of the value is the half number of vector
+ // elements. In this case, we can use CONCAT_VECTORS. For example,
+ //
+ // canUseVECTOR_CONCAT = true;
+ // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
+ // t24, t24, t24, t24, t24, t24, t24, t24
+ //
+ // canUseVECTOR_CONCAT = false;
+ // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
+ // t24, t24, t24, t24, t24, t24, t24, t24
+ bool canUseVECTOR_CONCAT = true;
+ for (auto Pair : DifferentValueMap) {
+ // Check different values have same length which is NumElts / 2.
+ if (Pair.second != NumElts / 2)
+ canUseVECTOR_CONCAT = false;
+ Vals.push_back(Pair.first);
+ }
+
+ // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
+ // CONCAT_VECTORs. For example,
+ //
+ // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
+ // t24, t24, t24, t24, t24, t24, t24, t24
+ // ==>
+ // t26: v8i8 = AArch64ISD::DUP t23
+ // t28: v8i8 = AArch64ISD::DUP t24
+ // t29: v16i8 = concat_vectors t26, t28
+ if (canUseVECTOR_CONCAT) {
+ EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ if (isTypeLegal(SubVT) && SubVT.isVector() &&
+ SubVT.getVectorNumElements() >= 2) {
+ SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
+ SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
+ SDValue DUP1 =
+ LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
+ SDValue DUP2 =
+ LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
+ SDValue CONCAT_VECTORS =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
+ return CONCAT_VECTORS;
+ }
+ }
+
+ // Let's try to generate two DUPs and VECTOR_SHUFFLE. For example,
+ //
+ // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
+ // ==>
+ // t28: v8i8 = AArch64ISD::DUP t25
+ // t30: v8i8 = AArch64ISD::DUP t26
+ // t31: v8i8 = vector_shuffle<0,0,0,0,8,8,8,8> t28, t30
+ if (NumElts >= 8) {
+ SmallVector<int, 16> MaskVec;
+ // Build mask for VECTOR_SHUFLLE.
+ SDValue FirstLaneVal = Op.getOperand(0);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Val = Op.getOperand(i);
+ if (FirstLaneVal == Val)
+ MaskVec.push_back(0);
+ else
+ MaskVec.push_back(NumElts);
+ }
+
+ SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
+ SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
+ SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops1), DAG);
+ SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops2), DAG);
+ SDValue VECTOR_SHUFFLE =
+ DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec);
+ return VECTOR_SHUFFLE;
+ }
+ }
+
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
index f9880253d535..accbdd2459d4 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -4,24 +4,9 @@
define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: ldrb w8, [x1]
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: ld1r { v1.8b }, [x1]
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
entry:
%0 = load i8, ptr %a, align 1
@@ -75,24 +60,9 @@ entry:
define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: ld1r { v1.8b }, [x0]
+; CHECK-NEXT: ld1r { v0.8b }, [x1]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
entry:
%0 = load i8, ptr %a, align 1
@@ -128,17 +98,12 @@ entry:
define <8 x i8> @test6(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test6:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: ldrb w8, [x1]
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ld1r { v0.8b }, [x1]
+; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: ld1r { v1.8b }, [x0]
+; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
%0 = load i8, ptr %a, align 1
@@ -154,17 +119,12 @@ entry:
define <8 x i8> @test7(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test7:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: ld1r { v1.8b }, [x1]
+; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
%0 = load i8, ptr %a, align 1
@@ -180,16 +140,9 @@ entry:
define <8 x i16> @test8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov v0.h[2], w8
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ldrh w8, [x1]
-; CHECK-NEXT: mov v0.h[4], w8
-; CHECK-NEXT: mov v0.h[5], w8
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: mov v0.h[7], w8
+; CHECK-NEXT: ld1r { v1.4h }, [x1]
+; CHECK-NEXT: ld1r { v0.4h }, [x0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
entry:
%0 = load i16, ptr %a, align 1
@@ -205,12 +158,9 @@ entry:
define <4 x i32> @test9(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: ldr w8, [x1]
-; CHECK-NEXT: mov v0.s[2], w8
-; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ld1r { v1.2s }, [x1]
+; CHECK-NEXT: ld1r { v0.2s }, [x0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
entry:
%0 = load i32, ptr %a, align 1
More information about the llvm-commits
mailing list