[llvm] b1ada7a - [DAG] Support store merging of vector constant stores (try 2)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 10 09:00:05 PDT 2023
Author: Philip Reames
Date: 2023-08-10T08:54:05-07:00
New Revision: b1ada7a1d31ee28a7c0e57268c6c6fe24931f25d
URL: https://github.com/llvm/llvm-project/commit/b1ada7a1d31ee28a7c0e57268c6c6fe24931f25d
DIFF: https://github.com/llvm/llvm-project/commit/b1ada7a1d31ee28a7c0e57268c6c6fe24931f25d.diff
LOG: [DAG] Support store merging of vector constant stores (try 2)
The original commit didn't handle the case where one of the stores was a
truncating store of the build_vector. The existing codepath produced
wrong code (which thankfully also failed asserts) instead of guarding
against unexpected types. The original commit message follows:
I ran across this when making a change to RISCV memset lowering. It seems
very odd that manually merging stores into a vector prevents them from
being merged any further; a small illustrative sketch follows the review
link below.
Differential Revision: https://reviews.llvm.org/D156349
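As a concrete illustration (a hedged sketch, not part of the patch; the
function name is made up and the behavior mirrors the new
merge_vec_stores_of_zero test added below): two adjacent stores of a
constant zero vector are now classified as StoreSource::Constant via the
new BUILD_VECTOR case in getStoreSource(), so the DAGCombiner can merge
them into a single wider store.

; Hypothetical input IR: two adjacent 16-byte stores of a constant
; zero vector into consecutive slots of the same object.
define void @adjacent_zero_vector_stores(ptr %p) {
  %hi = getelementptr inbounds <4 x i32>, ptr %p, i64 1
  store <4 x i32> zeroinitializer, ptr %p, align 16
  store <4 x i32> zeroinitializer, ptr %hi, align 16
  ret void
}

On an AVX-capable x86 target this now becomes a single 32-byte vmovups,
as shown in the updated MergeConsecutiveStores.ll test below.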
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
llvm/test/CodeGen/X86/avx512-mask-op.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 374730ba15ee3e..8f1ca4909ee3e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -698,6 +698,11 @@ namespace {
case ISD::Constant:
case ISD::ConstantFP:
return StoreSource::Constant;
+ case ISD::BUILD_VECTOR:
+ if (ISD::isBuildVectorOfConstantSDNodes(StoreVal.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(StoreVal.getNode()))
+ return StoreSource::Constant;
+ return StoreSource::Unknown;
case ISD::EXTRACT_VECTOR_ELT:
case ISD::EXTRACT_SUBVECTOR:
return StoreSource::Extract;
@@ -19403,23 +19408,24 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
for (unsigned I = 0; I != NumStores; ++I) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
SDValue Val = St->getValue();
- // If constant is of the wrong type, convert it now.
+ // If constant is of the wrong type, convert it now. This comes up
+ // when one of our stores was truncating.
if (MemVT != Val.getValueType()) {
Val = peekThroughBitcasts(Val);
// Deal with constants of wrong size.
if (ElementSizeBits != Val.getValueSizeInBits()) {
- EVT IntMemVT =
- EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
- if (isa<ConstantFPSDNode>(Val)) {
+ auto *C = dyn_cast<ConstantSDNode>(Val);
+ if (!C)
// Not clear how to truncate FP values.
+ // TODO: Handle truncation of build_vector constants
return false;
- }
- if (auto *C = dyn_cast<ConstantSDNode>(Val))
- Val = DAG.getConstant(C->getAPIntValue()
- .zextOrTrunc(Val.getValueSizeInBits())
- .zextOrTrunc(ElementSizeBits),
- SDLoc(C), IntMemVT);
+ EVT IntMemVT =
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ Val = DAG.getConstant(C->getAPIntValue()
+ .zextOrTrunc(Val.getValueSizeInBits())
+ .zextOrTrunc(ElementSizeBits),
+ SDLoc(C), IntMemVT);
}
// Make sure correctly size type is the correct type.
Val = DAG.getBitcast(MemVT, Val);
@@ -19495,6 +19501,10 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
// If fp truncation is necessary give up for now.
if (MemVT.getSizeInBits() != ElementSizeBits)
return false;
+ } else if (ISD::isBuildVectorOfConstantSDNodes(Val.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(Val.getNode())) {
+ // Not yet handled
+ return false;
} else {
llvm_unreachable("Invalid constant element type");
}
@@ -19625,7 +19635,7 @@ void DAGCombiner::getStoreMergeCandidates(
case StoreSource::Constant:
if (NoTypeMatch)
return false;
- if (!isIntOrFPConstant(OtherBC))
+ if (getStoreSource(OtherBC) != StoreSource::Constant)
return false;
break;
case StoreSource::Extract:
@@ -19847,6 +19857,8 @@ bool DAGCombiner::tryStoreMergeOfConstants(
IsElementZero = C->isZero();
else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
IsElementZero = C->getConstantFPValue()->isNullValue();
+ else if (ISD::isBuildVectorAllZeros(StoredVal.getNode()))
+ IsElementZero = true;
if (IsElementZero) {
if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
FirstZeroAfterNonZero = i;
diff --git a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
index 742fead8a81d47..4730c2755acdba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
@@ -544,53 +544,31 @@ define void @bzero_32(ptr %a) nounwind {
define void @bzero_64(ptr %a) nounwind {
; RV32-LABEL: bzero_64:
; RV32: # %bb.0:
-; RV32-NEXT: addi a1, a0, 48
-; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: addi a1, a0, 32
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: addi a1, a0, 16
-; RV32-NEXT: vse8.v v8, (a1)
; RV32-NEXT: vse8.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: bzero_64:
; RV64: # %bb.0:
-; RV64-NEXT: addi a1, a0, 48
-; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 32
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 16
-; RV64-NEXT: vse8.v v8, (a1)
; RV64-NEXT: vse8.v v8, (a0)
; RV64-NEXT: ret
;
; RV32-FAST-LABEL: bzero_64:
; RV32-FAST: # %bb.0:
-; RV32-FAST-NEXT: addi a1, a0, 48
-; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-FAST-NEXT: vmv.v.i v8, 0
-; RV32-FAST-NEXT: vse64.v v8, (a1)
-; RV32-FAST-NEXT: addi a1, a0, 32
-; RV32-FAST-NEXT: vse64.v v8, (a1)
-; RV32-FAST-NEXT: addi a1, a0, 16
-; RV32-FAST-NEXT: vse64.v v8, (a1)
; RV32-FAST-NEXT: vse64.v v8, (a0)
; RV32-FAST-NEXT: ret
;
; RV64-FAST-LABEL: bzero_64:
; RV64-FAST: # %bb.0:
-; RV64-FAST-NEXT: addi a1, a0, 48
-; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-FAST-NEXT: vmv.v.i v8, 0
-; RV64-FAST-NEXT: vse64.v v8, (a1)
-; RV64-FAST-NEXT: addi a1, a0, 32
-; RV64-FAST-NEXT: vse64.v v8, (a1)
-; RV64-FAST-NEXT: addi a1, a0, 16
-; RV64-FAST-NEXT: vse64.v v8, (a1)
; RV64-FAST-NEXT: vse64.v v8, (a0)
; RV64-FAST-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 0, i64 64, i1 0)
@@ -686,27 +664,15 @@ define void @aligned_bzero_32(ptr %a) nounwind {
define void @aligned_bzero_64(ptr %a) nounwind {
; RV32-BOTH-LABEL: aligned_bzero_64:
; RV32-BOTH: # %bb.0:
-; RV32-BOTH-NEXT: addi a1, a0, 48
-; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 16
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: vse64.v v8, (a0)
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: aligned_bzero_64:
; RV64-BOTH: # %bb.0:
-; RV64-BOTH-NEXT: addi a1, a0, 48
-; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 16
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: vse64.v v8, (a0)
; RV64-BOTH-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 64, i1 0)
@@ -717,28 +683,16 @@ define void @aligned_bzero_66(ptr %a) nounwind {
; RV32-BOTH-LABEL: aligned_bzero_66:
; RV32-BOTH: # %bb.0:
; RV32-BOTH-NEXT: sh zero, 64(a0)
-; RV32-BOTH-NEXT: addi a1, a0, 48
-; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 16
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: vse64.v v8, (a0)
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: aligned_bzero_66:
; RV64-BOTH: # %bb.0:
; RV64-BOTH-NEXT: sh zero, 64(a0)
-; RV64-BOTH-NEXT: addi a1, a0, 48
-; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 16
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: vse64.v v8, (a0)
; RV64-BOTH-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 66, i1 0)
@@ -754,12 +708,8 @@ define void @aligned_bzero_96(ptr %a) nounwind {
; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: addi a1, a0, 64
; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 48
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 16
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
+; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-BOTH-NEXT: vmv.v.i v8, 0
; RV32-BOTH-NEXT: vse64.v v8, (a0)
; RV32-BOTH-NEXT: ret
;
@@ -771,12 +721,8 @@ define void @aligned_bzero_96(ptr %a) nounwind {
; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: addi a1, a0, 64
; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 48
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 16
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
+; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-BOTH-NEXT: vmv.v.i v8, 0
; RV64-BOTH-NEXT: vse64.v v8, (a0)
; RV64-BOTH-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 96, i1 0)
@@ -786,43 +732,15 @@ define void @aligned_bzero_96(ptr %a) nounwind {
define void @aligned_bzero_128(ptr %a) nounwind {
; RV32-BOTH-LABEL: aligned_bzero_128:
; RV32-BOTH: # %bb.0:
-; RV32-BOTH-NEXT: addi a1, a0, 112
-; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 96
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 80
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 64
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 48
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 16
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: vse64.v v8, (a0)
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: aligned_bzero_128:
; RV64-BOTH: # %bb.0:
-; RV64-BOTH-NEXT: addi a1, a0, 112
-; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 96
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 80
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 64
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 48
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 16
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: vse64.v v8, (a0)
; RV64-BOTH-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 128, i1 0)
@@ -832,74 +750,18 @@ define void @aligned_bzero_128(ptr %a) nounwind {
define void @aligned_bzero_256(ptr %a) nounwind {
; RV32-BOTH-LABEL: aligned_bzero_256:
; RV32-BOTH: # %bb.0:
-; RV32-BOTH-NEXT: addi a1, a0, 240
-; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 224
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 208
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 192
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 176
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 160
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 144
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: addi a1, a0, 128
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 112
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 96
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 80
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 64
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 48
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vse64.v v8, (a1)
-; RV32-BOTH-NEXT: addi a1, a0, 16
+; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-BOTH-NEXT: vmv.v.i v8, 0
; RV32-BOTH-NEXT: vse64.v v8, (a1)
; RV32-BOTH-NEXT: vse64.v v8, (a0)
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: aligned_bzero_256:
; RV64-BOTH: # %bb.0:
-; RV64-BOTH-NEXT: addi a1, a0, 240
-; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 224
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 208
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 192
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 176
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 160
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 144
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: addi a1, a0, 128
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 112
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 96
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 80
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 64
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 48
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a1, a0, 16
+; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-BOTH-NEXT: vmv.v.i v8, 0
; RV64-BOTH-NEXT: vse64.v v8, (a1)
; RV64-BOTH-NEXT: vse64.v v8, (a0)
; RV64-BOTH-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index 565d3588710e36..b820023c961aa1 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -767,20 +767,61 @@ define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
}
-; Merging vector stores when sourced from a constant vector is not currently handled.
+define void @merge_vec_stores_of_zero(<4 x i32>* %ptr) {
+; CHECK-LABEL: merge_vec_stores_of_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
+ %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
+ store <4 x i32> zeroinitializer, <4 x i32>* %idx0, align 16
+ store <4 x i32> zeroinitializer, <4 x i32>* %idx1, align 16
+ ret void
+}
+
+define void @merge_vec_stores_of_constant_splat(<4 x i32>* %ptr) {
+; CHECK-LABEL: merge_vec_stores_of_constant_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
+; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
+; CHECK-NEXT: retq
+ %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
+ %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
+ store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %idx0, align 16
+ store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %idx1, align 16
+ ret void
+}
+
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
; CHECK-LABEL: merge_vec_stores_of_constants:
; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
; CHECK-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
- store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
- store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
+ store <4 x i32> <i32 25, i32 51, i32 45, i32 0>, <4 x i32>* %idx0, align 16
+ store <4 x i32> <i32 0, i32 265, i32 26, i32 0>, <4 x i32>* %idx1, align 16
ret void
+}
+define void @merge_vec_stores_of_constants_with_undefs(<4 x i32>* %ptr) {
+; CHECK-LABEL: merge_vec_stores_of_constants_with_undefs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
+ %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
+ store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, <4 x i32>* %idx0, align 16
+ store <4 x i32> <i32 0, i32 undef, i32 0, i32 0>, <4 x i32>* %idx1, align 16
+ ret void
}
; This is a minimized test based on real code that was failing.
@@ -855,17 +896,17 @@ define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p
; CHECK-LABEL: merge_const_store_heterogeneous:
; CHECK: # %bb.0:
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB20_3
+; CHECK-NEXT: jle .LBB23_3
; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB20_2: # %.lr.ph
+; CHECK-NEXT: .LBB23_2: # %.lr.ph
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movq %rax, (%rsi)
; CHECK-NEXT: addq $24, %rsi
; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB20_2
-; CHECK-NEXT: .LBB20_3: # %._crit_edge
+; CHECK-NEXT: jne .LBB23_2
+; CHECK-NEXT: .LBB23_3: # %._crit_edge
; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index d9f051c69cb237..9e689341f7b88e 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -4211,13 +4211,33 @@ entry:
}
define void @store_v128i1_constant(ptr %R) {
-; CHECK-LABEL: store_v128i1_constant:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movabsq $-4611686310485172227, %rax ## imm = 0xBFFFFFBBFFFFDFFD
-; CHECK-NEXT: movq %rax, 8(%rdi)
-; CHECK-NEXT: movabsq $-2305843576149381123, %rax ## imm = 0xDFFFFF7BFFFFEFFD
-; CHECK-NEXT: movq %rax, (%rdi)
-; CHECK-NEXT: retq
+; KNL-LABEL: store_v128i1_constant:
+; KNL: ## %bb.0: ## %entry
+; KNL-NEXT: vmovaps {{.*#+}} xmm0 = [61437,65535,65403,57343,57341,65535,65467,49151]
+; KNL-NEXT: vmovaps %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v128i1_constant:
+; SKX: ## %bb.0: ## %entry
+; SKX-NEXT: movabsq $-4611686310485172227, %rax ## imm = 0xBFFFFFBBFFFFDFFD
+; SKX-NEXT: movq %rax, 8(%rdi)
+; SKX-NEXT: movabsq $-2305843576149381123, %rax ## imm = 0xDFFFFF7BFFFFEFFD
+; SKX-NEXT: movq %rax, (%rdi)
+; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v128i1_constant:
+; AVX512BW: ## %bb.0: ## %entry
+; AVX512BW-NEXT: movabsq $-4611686310485172227, %rax ## imm = 0xBFFFFFBBFFFFDFFD
+; AVX512BW-NEXT: movq %rax, 8(%rdi)
+; AVX512BW-NEXT: movabsq $-2305843576149381123, %rax ## imm = 0xDFFFFF7BFFFFEFFD
+; AVX512BW-NEXT: movq %rax, (%rdi)
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v128i1_constant:
+; AVX512DQ: ## %bb.0: ## %entry
+; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [61437,65535,65403,57343,57341,65535,65467,49151]
+; AVX512DQ-NEXT: vmovaps %xmm0, (%rdi)
+; AVX512DQ-NEXT: retq
;
; X86-LABEL: store_v128i1_constant:
; X86: ## %bb.0: ## %entry