[llvm] [X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (PR #140028)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 20 02:18:19 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the new instruction isn't slower.
Throughput must not regress, but a small increase in latency is allowed, proportional to how much constant data we're saving (I've used a simple estimate of 1 cycle per 128 bits of data saved).
NOTE: this currently ignores hoisted constant loads, where the slower instruction might still be acceptable.
Fixes #135998
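For illustration, here is a minimal standalone sketch of the cost comparison described above. The names and the `SchedInfo` struct are hypothetical; the actual patch queries the scheduler model via `MCSchedModel` (see the diff below).

```cpp
// Sketch (not the patch itself): reject the smaller-constant opcode if it
// regresses reciprocal throughput, and only tolerate extra latency in
// proportion to the constant data saved (1 cycle per 128 bits, rounded up).
#include <cassert>

struct SchedInfo {
  double RThroughput; // reciprocal throughput (cycles per instruction)
  int Latency;        // cycles
};

static bool newOpcPreferable(SchedInfo Old, SchedInfo New, unsigned RegBitWidth,
                             unsigned NumCstElts, unsigned MemBitWidth) {
  unsigned BitsSaved = RegBitWidth - NumCstElts * MemBitWidth;
  if (Old.RThroughput != New.RThroughput)
    return New.RThroughput < Old.RThroughput; // never regress throughput
  int LatTol = (BitsSaved + 127) / 128;       // 1 extra cycle per 128 bits saved
  if (Old.Latency != New.Latency)
    return New.Latency < Old.Latency + LatTol;
  return true; // equal tput/lat: prefer the smaller constant pool entry
}

int main() {
  // 512-bit full load vs. a 32-bit broadcast: 480 bits saved -> 4 cycles of slack.
  assert(newOpcPreferable({0.5, 8}, {0.5, 11}, 512, 1, 32));
  // Same throughput, but latency grows beyond the tolerated slack: keep the old opcode.
  assert(!newOpcPreferable({0.5, 4}, {0.5, 6}, 128, 1, 32));
  return 0;
}
```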
---
Patch is 75.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140028.diff
30 Files Affected:
- (modified) llvm/lib/Target/X86/X86FixupVectorConstants.cpp (+36-1)
- (modified) llvm/test/CodeGen/X86/avgceils.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/avgfloors.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/avx512-build-vector.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/combine-or-shuffle.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/combine-or.ll (+6-11)
- (modified) llvm/test/CodeGen/X86/constant-pool-sharing.ll (+2-4)
- (modified) llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll (+22-8)
- (modified) llvm/test/CodeGen/X86/fpclamptosat_vec.ll (+10-6)
- (modified) llvm/test/CodeGen/X86/icmp-pow2-mask.ll (+13-27)
- (modified) llvm/test/CodeGen/X86/kmov.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/machine-combiner-int-vec.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/min-legal-vector-width.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/pr30290.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr38639.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/pr46532.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr57340.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr63108.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr77459.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/recip-fastmath.ll (+106-22)
- (modified) llvm/test/CodeGen/X86/recip-fastmath2.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/slow-pmulld.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/sqrt-fastmath.ll (+16-9)
- (modified) llvm/test/CodeGen/X86/sse-domains.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/trunc-vector-width.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll (+17-17)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining.ll (+5-5)
- (modified) llvm/test/CodeGen/X86/vector-trunc-usat.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index a415a45775984..5d3b49b733615 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -348,6 +348,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
+ bool OptSize = MF.getFunction().hasOptSize();
struct FixupEntry {
int Op;
@@ -356,6 +357,36 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
RebuildConstant;
};
+
+ auto NewOpcPreferable = [&](const FixupEntry &Fixup,
+ unsigned RegBitWidth) -> bool {
+ if (SM->hasInstrSchedModel()) {
+ unsigned NewOpc = Fixup.Op;
+ auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
+ auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
+ unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
+
+ // Compare tput/lat - avoid any regressions, but allow extra cycle of
+ // latency in exchange for each 128-bit (or less) constant pool reduction
+ // (this is a very simple cost:benefit estimate - there will probably be
+ // better ways to calculate this).
+ double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
+ double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
+ if (OldTput != NewTput)
+ return NewTput < OldTput;
+
+ int LatTol = (BitsSaved + 127) / 128;
+ int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
+ int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
+ if (OldLat != NewLat)
+ return NewLat < (OldLat + LatTol);
+ }
+
+ // We either were unable to get tput/lat or all values were equal.
+ // Prefer the new opcode for reduced constant pool size.
+ return true;
+ };
+
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
unsigned OperandNo) {
#ifdef EXPENSIVE_CHECKS
@@ -372,7 +403,11 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
- if (Fixup.Op) {
+ // Always use the smallest possible constant load with opt/minsize,
+ // otherwise use the smallest instruction that doesn't affect
+ // performance.
+ // TODO: If constant has been hoisted from loop, use smallest constant.
+ if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = Fixup.RebuildConstant(
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index bb6f3fc9f588a..70b2d4be2fd81 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 2566860357130..0508e5ccb5430 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2b..55478a2e93154 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 55b1cdeddb853..95b5fcf8eac52 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: or_and_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
; AVX512-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 08262e4d34b26..8c91274abf3dd 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE-NEXT: retq
;
-; AVX1-LABEL: or_zext_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: or_zext_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm0 = mem[0,0]
-; AVX2-NEXT: retq
+; AVX-LABEL: or_zext_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
;
; AVX2-LABEL: PR89533:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index 9e0114c017419..be835f78292e5 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-LINUX-LABEL: store_repeated_constants:
; AVX-LINUX: # %bb.0:
-; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-MSVC-LABEL: store_repeated_constants:
; AVX-MSVC: # %bb.0:
-; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 59a61722927de..5519d9b787b7f 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
;
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-NO-FASTFMA: # %bb.0:
+; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT: retq
+;
+; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 4ac829c0dfd53..1a2cfd69650b8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -576,7 +576,8 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -1023,7 +1024,8 @@ define <4 x i32> @ustest_f...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/140028
More information about the llvm-commits mailing list