[llvm] cd68e17 - [AArch64] Add support for efficient bitcast in vector truncate store.
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 28 03:19:50 PDT 2023
Author: Lawrence Benson
Date: 2023-04-28T11:19:45+01:00
New Revision: cd68e17bc2f9b7b54a3d3ab5f917793d41ce17cb
URL: https://github.com/llvm/llvm-project/commit/cd68e17bc2f9b7b54a3d3ab5f917793d41ce17cb
DIFF: https://github.com/llvm/llvm-project/commit/cd68e17bc2f9b7b54a3d3ab5f917793d41ce17cb.diff
LOG: [AArch64] Add support for efficient bitcast in vector truncate store.
Following the changes in D145301, we now also support the efficient bitcast
when storing a bool vector. Previously, such stores were expanded.
Differential Revision: https://reviews.llvm.org/D148316
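For illustration, a minimal sketch of the pattern this patch targets, mirroring the store_8_elements test added below (LLVM IR; the function and value names are the test's own):

define void @store_8_elements(<8 x i16> %vec, ptr %out) {
  %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
  ; truncating store of an <8 x i1> bool vector
  store <8 x i1> %cmp_result, ptr %out
  ret void
}

With this change, the <8 x i1> store is lowered through vectorToScalarBitmask into a per-lane bit mask, a horizontal addv reduction, and a single strb, instead of being expanded element by element (see the CHECK lines in the new test file).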
Added:
llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
llvm/test/CodeGen/AArch64/vec_uaddo.ll
llvm/test/CodeGen/AArch64/vec_umulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1b51140fe1d1..1fb9833c8cbe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19775,20 +19775,25 @@ static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ComparisonResult(N, 0);
- EVT BoolVecVT = ComparisonResult.getValueType();
- assert(BoolVecVT.isVector() && "Must be a vector type");
+ EVT VecVT = ComparisonResult.getValueType();
+ assert(VecVT.isVector() && "Must be a vector type");
- unsigned NumElts = BoolVecVT.getVectorNumElements();
+ unsigned NumElts = VecVT.getVectorNumElements();
if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
return SDValue();
+ if (VecVT.getVectorElementType() != MVT::i1 &&
+ !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
+ return SDValue();
+
// If we can find the original types to work on instead of a vector of i1,
// we can avoid extend/extract conversion instructions.
- EVT VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
- if (!VecVT.isSimple()) {
- unsigned BitsPerElement = std::max(64 / NumElts, 8u); // min. 64-bit vector
- VecVT =
- BoolVecVT.changeVectorElementType(MVT::getIntegerVT(BitsPerElement));
+ if (VecVT.getVectorElementType() == MVT::i1) {
+ VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
+ if (!VecVT.isSimple()) {
+ unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
+ VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
+ }
}
VecVT = VecVT.changeVectorElementTypeToInteger();
@@ -19849,6 +19854,37 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
}
+static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
+ StoreSDNode *Store) {
+ if (!Store->isTruncatingStore())
+ return SDValue();
+
+ SDLoc DL(Store);
+ SDValue VecOp = Store->getValue();
+ EVT VT = VecOp.getValueType();
+ EVT MemVT = Store->getMemoryVT();
+
+ if (!MemVT.isVector() || !VT.isVector() ||
+ MemVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ // If we are storing a vector that we are currently building, let
+ // `scalarizeVectorStore()` handle this more efficiently.
+ if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
+ return SDValue();
+
+ VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
+ SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
+ if (!VectorBits)
+ return SDValue();
+
+ EVT StoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
+ SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
+ return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
+ Store->getMemOperand());
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -19887,6 +19923,9 @@ static SDValue performSTORECombine(SDNode *N,
if (SDValue Store = foldTruncStoreOfExt(DAG, N))
return Store;
+ if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
+ return Store;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
index cf7b9b173905..e482833ffe45 100644
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -1,9 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
; CHECK-LABEL: test_mismatched_setcc:
-; CHECK: cmeq [[CMP128:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: xtn {{v[0-9]+}}.4h, [[CMP128]].4s
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #63, msl #16
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
%tst = icmp eq <4 x i22> %l, %r
store <4 x i1> %tst, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index e1daead54c6f..49380820beb3 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -418,18 +418,59 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
ret i4 %bitmask
}
-; TODO(lawben): Change this in follow-up patch to #D145301, as truncating stores fix this.
-; Larger vector types don't map directly.
-define i8 @no_convert_large_vector(<8 x i32> %vec) {
+; Larger vector types don't map directly, but they can be split/truncated and then converted.
+; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
+define i8 @convert_large_vector(<8 x i32> %vec) {
+; CHECK-LABEL: lCPI15_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
; CHECK-LABEL: convert_large_vector:
-; CHECK: cmeq.4s v1, v1, #0
-; CHECK-NOT: addv
+; CHECK: Lloh30:
+; CHECK-NEXT: adrp x8, lCPI15_0 at PAGE
+; CHECK-NEXT: cmeq.4s v1, v1, #0
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: Lloh31:
+; CHECK-NEXT: ldr q1, [x8, lCPI15_0 at PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
%cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
%bitmask = bitcast <8 x i1> %cmp_result to i8
ret i8 %bitmask
}
+define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
+; CHECK-LABEL: convert_legalized_illegal_element_size
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movi.4s v1, #63, msl #16
+; CHECK-NEXT: Lloh32:
+; CHECK-NEXT: adrp x8, lCPI16_0 at PAGE
+; CHECK-NEXT: cmtst.4s v0, v0, v1
+; CHECK-NEXT: Lloh33:
+; CHECK-NEXT: ldr d1, [x8, lCPI16_0 at PAGEOFF]
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
+ %bitmask = bitcast <4 x i1> %cmp_result to i4
+ ret i4 %bitmask
+}
+
; This may still be converted as a v8i8 after the vector concat (but not as v4iX).
define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
; CHECK-LABEL: no_direct_convert_for_bad_concat:
@@ -450,3 +491,12 @@ define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
%cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
ret <8 x i1> %cmp_result
}
+
+define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
+; CHECK-LABEL: no_combine_illegal_num_elements
+; CHECK-NOT: addv
+
+ %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
+ %bitmask = bitcast <6 x i1> %cmp_result to i6
+ ret i6 %bitmask
+}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
new file mode 100644
index 000000000000..dc878090d321
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -0,0 +1,281 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_16_elements(<16 x i8> %vec, ptr %out) {
+; Bits used in mask
+; CHECK-LABEL: lCPI0_0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; Actual conversion
+; CHECK-LABEL: store_16_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x8, lCPI0_0 at PAGE
+; CHECK-NEXT: cmeq.16b v0, v0, #0
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q1, [x8, lCPI0_0 at PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: ext.16b v1, v0, v0, #8
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: addv.8b b1, v1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: orr w8, w9, w8, lsl #8
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+ store <16 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_8_elements(<8 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI1_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
+; CHECK-LABEL: store_8_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh2:
+; CHECK-NEXT: adrp x8, lCPI1_0 at PAGE
+; CHECK-NEXT: cmeq.8h v0, v0, #0
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr q1, [x8, lCPI1_0 at PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
+ store <8 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_4_elements(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: store_4_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh4:
+; CHECK-NEXT: adrp x8, lCPI2_0 at PAGE
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: Lloh5:
+; CHECK-NEXT: ldr q1, [x8, lCPI2_0 at PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
+ store <4 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_2_elements(<2 x i64> %vec, ptr %out) {
+; CHECK-LABEL: lCPI3_0:
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad 2
+
+; CHECK-LABEL: store_2_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: adrp x8, lCPI3_0 at PAGE
+; CHECK-NEXT: cmeq.2d v0, v0, #0
+; CHECK-NEXT: Lloh7:
+; CHECK-NEXT: ldr q1, [x8, lCPI3_0 at PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addp.2d d0, v0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
+ store <2 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI4_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: add_trunc_compare_before_store
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: adrp x8, lCPI4_0 at PAGE
+; CHECK-NEXT: shl.4s v0, v0, #31
+; CHECK-NEXT: cmlt.4s v0, v0, #0
+; CHECK-NEXT: Lloh9:
+; CHECK-NEXT: ldr q1, [x8, lCPI4_0 at PAGEOFF]
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %trunc = trunc <4 x i32> %vec to <4 x i1>
+ store <4 x i1> %trunc, ptr %out
+ ret void
+}
+
+define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
+; CHECK-LABEL: lCPI5_0:
+; CHECK: .short 1
+; CHECK: .short 2
+; CHECK: .short 4
+; CHECK: .short 8
+
+; CHECK-LABEL: add_trunc_mask_unknown_vector_type
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: adrp x8, lCPI5_0 at PAGE
+; CHECK-NEXT: shl.4h v0, v0, #15
+; CHECK-NEXT: cmlt.4h v0, v0, #0
+; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: ldr d1, [x8, lCPI5_0 at PAGEOFF]
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ store <4 x i1> %vec, ptr %out
+ ret void
+}
+
+define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
+; CHECK-LABEL: lCPI6_0:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; CHECK-LABEL: store_8_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: adrp x8, lCPI6_0 at PAGE
+; CHECK-NEXT: cmeq.8b v0, v0, #0
+; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: ldr d1, [x8, lCPI6_0 at PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: st1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <8 x i8> %vec, zeroinitializer
+ store <8 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI7_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+
+; CHECK-LABEL: store_4_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: adrp x8, lCPI7_0 at PAGE
+; CHECK-NEXT: cmeq.4h v0, v0, #0
+; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: ldr d1, [x8, lCPI7_0 at PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i16> %vec, zeroinitializer
+ store <4 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI8_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+
+; CHECK-LABEL: store_2_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: adrp x8, lCPI8_0 at PAGE
+; CHECK-NEXT: cmeq.2s v0, v0, #0
+; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: ldr d1, [x8, lCPI8_0 at PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addp.2s v0, v0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
+ store <2 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_without_truncate
+; CHECK: cmtst.16b v0, v0, v0
+; CHECK-NOT: addv.8b b0, v0
+
+ %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+ %extended_result = sext <16 x i1> %cmp_result to <16 x i8>
+ store <16 x i8> %extended_result, ptr %out
+ ret void
+}
+
+define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_for_non_bool_truncate
+; CHECK: xtn.4h v0, v0
+; CHECK-NOT: addv.4s s0, v0
+
+ %trunc = trunc <4 x i32> %vec to <4 x i8>
+ store <4 x i8> %trunc, ptr %out
+ ret void
+}
+
+define void @no_combine_for_build_vector(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: no_combine_for_build_vector
+; CHECK-NOT: addv
+
+ %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+ %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+ %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+ %vec = insertelement <4 x i1> %3, i1 %d, i64 3
+ store <4 x i1> %vec, ptr %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 4ccc2c642a0d..a4e1c801d98e 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -246,22 +246,20 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.4h, #1
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: and v1.8b, v0.8b, v2.8b
-; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: bfi w8, w9, #1, #1
+; CHECK-NEXT: shl v1.4h, v0.4h, #15
+; CHECK-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEXT: cmeq v0.4h, v2.4h, v0.4h
+; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: mvn v0.8b, v0.8b
-; CHECK-NEXT: bfi w8, w10, #2, #1
-; CHECK-NEXT: orr w8, w8, w11, lsl #3
-; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: addv h1, v1.4h
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index a66cb6a7e568..e40f477eebcf 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -296,18 +296,15 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: umulo_v4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d2, d0
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv h1, v0.4h
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: umov w10, v1.h[2]
-; CHECK-NEXT: umov w11, v1.h[3]
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: bfi w8, w9, #1, #1
-; CHECK-NEXT: bfi w8, w10, #2, #1
-; CHECK-NEXT: orr w8, w8, w11, lsl #3
-; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)