[llvm] [X86] Use X86FixupInstTunings to select between (V)MOVSS/D and (V)BLENDPS/D (PR #143312)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 8 08:11:46 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: 黃國庭 (houngkoungting)
Fixes #142588
Following @RKSimon's suggestion, the transformation is applied only when the blend immediate is exactly 1, i.e. when the instruction behaves like a move of the low element. In addition, the conversion is only performed when optimizing for size or when the target prefers MOVSS/D over BLENDPS/D for performance reasons.
The list of opcodes handled in the switch was identified with the help of GPT.
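
For illustration, this is the kind of rewrite the pass now performs, taken from the updated check lines in avx512-intrinsics-upgrade.ll in this patch. A blend immediate of 1 takes only element 0 from the second source, so the blend has scalar-move semantics and the shorter VMOVSS encoding can be used (4 bytes instead of 6 in this test):

```asm
# before: blend with immediate 1 -- element 0 from %xmm1, elements 1-3 from %xmm0
vblendps $1, %xmm1, %xmm0, %xmm0    # xmm0 = xmm1[0],xmm0[1,2,3]
# after: the same result expressed as a scalar move
vmovss   %xmm1, %xmm0, %xmm0        # xmm0 = xmm1[0],xmm0[1,2,3]
```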
---
Patch is 106.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143312.diff
55 Files Affected:
- (modified) llvm/lib/Target/X86/X86FixupInstTuning.cpp (+33)
- (modified) llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/avx-insertelt.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512-intrinsics.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512copy-intrinsics.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/build-vector-512.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/buildvec-extract.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/coalesce_commute_movsd.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/combine-and.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/combine-or-shuffle.ll (+60-20)
- (modified) llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/fmsubadd-combine.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll (+7-7)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll (+7-7)
- (modified) llvm/test/CodeGen/X86/half-constrained.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/half-darwin.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/insertelement-zero.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/masked_expandload.ll (+7-7)
- (modified) llvm/test/CodeGen/X86/masked_gather.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/masked_gather_scatter.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/masked_load.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/oddsubvector.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/pr40730.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/scalarize-fp.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/sse-insertelt.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/sse2.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/sse41.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vec-strict-128-fp16.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll (+10-10)
- (modified) llvm/test/CodeGen/X86/vec_extract-avx.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vec_floor.ll (+34-34)
- (modified) llvm/test/CodeGen/X86/vec_ss_load_fold.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-blend.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-half-conversions.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll (+5-5)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll (+10-22)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll (+7-7)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-zmov.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vselect.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 6bb7600dedcac..748ebcc8a5569 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -221,8 +221,41 @@ bool X86FixupInstTuningPass::processInstruction(
auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
return ProcessUNPCKToIntDomain(NewOpc);
};
+
+ auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
+ if (!MI.getOperand(NumOperands - 1).isImm() ||
+ MI.getOperand(NumOperands - 1).getImm() != 1)
+ return false;
+
+ bool Force = MF.getFunction().hasOptSize();
+ if (!Force && !NewOpcPreferable(MovOpc))
+ return false;
+ MI.setDesc(TII->get(MovOpc));
+ MI.removeOperand(NumOperands - 1);
+ return true;
+ };
switch (Opc) {
+ case X86::VBLENDPSrri:
+ case X86::VBLENDPSYrri:
+ case X86::VBLENDMPSZ128rrkz:
+ case X86::VBLENDMPSZ256rrkz:
+ case X86::VBLENDMPSZrrkz: {
+ int Imm = MI.getOperand(NumOperands - 1).getImm();
+ if (Imm != 1)
+ return false;
+ return ProcessBLENDToMOV(X86::VMOVSSrr);
+ }
+ case X86::VBLENDPDrri:
+ case X86::VBLENDPDYrri:
+ case X86::VBLENDMPDZ128rrkz:
+ case X86::VBLENDMPDZ256rrkz:
+ case X86::VBLENDMPDZrrkz: {
+ int Imm = MI.getOperand(NumOperands - 1).getImm();
+ if (Imm != 1)
+ return false;
+ return ProcessBLENDToMOV(X86::VMOVSDrr);
+ }
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 254a53fcac4de..65273870c3dfb 100644
--- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -11,7 +11,7 @@ define void @endless_loop() {
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vmovaps %ymm0, (%eax)
; AVX1-NEXT: vmovaps %ymm1, (%eax)
; AVX1-NEXT: vzeroupper
@@ -21,7 +21,7 @@ define void @endless_loop() {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%eax), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 18ca01290c914..81f3058f19579 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -8,7 +8,7 @@ define <8 x float> @insert_f32_firstelt_of_low_subvector(<8 x float> %x, float %
; ALL-LABEL: insert_f32_firstelt_of_low_subvector:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; ALL-NEXT: vmovss {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; ALL-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 0
ret <8 x float> %i0
@@ -94,7 +94,7 @@ define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float
; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
@@ -202,9 +202,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
; AVX-LABEL: insert_f32_firstelts:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516c..30bf1a261f4b7 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1843,7 +1843,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
-; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_sd:
@@ -1891,7 +1891,7 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index c1ef500d9d3de..aae48aba93be6 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -10483,7 +10483,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; CHECK-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%q = load float, ptr %ptr_b
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 926af4e9957af..f9b5994a18d36 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6505,7 +6505,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%q = load float, ptr %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a7ca23792e6fe..a2af7df44010e 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 789196c5e4848..69d17fe3ab69f 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -578,7 +578,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
; AVX-32-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0]
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm2[0]
; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
@@ -626,7 +626,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
; AVX-32-NEXT: vbroadcastss (%ecx), %xmm1
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
@@ -640,7 +640,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
; AVX-64-NEXT: vbroadcastss (%rdi), %xmm1
; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-64-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-64-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-64-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 545c57fed4b2c..9d856ed7647ca 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -42,7 +42,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX-LABEL: extract0_i32_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
@@ -85,7 +85,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
@@ -130,7 +130,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c83..8b3aa2964db02 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -51,7 +51,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
@@ -149,7 +149,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -235,12 +235,12 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index b42fd957d7f4f..086df87d1d5ff 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -44,12 +44,12 @@ define <4 x float> @insert_f32(float %a0, <4 x float> %a1) {
;
; AVX-LABEL: insert_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
%1 = insertelement <4 x float> %a1, float %a0, i32 0
ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index e5594dc9c5e3c..173457ff46677 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -37,7 +37,7 @@ define <4 x i32> @test1(<4 x i32> %A) {
; AVX-LABEL: test1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
ret <4 x i32> %1
@@ -195,7 +195,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
; AVX-LABEL: test11:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 95b5fcf8eac52..137c3d9dec7bd 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -86,10 +86,20 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test4:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
@@ -108,10 +118,20 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test5:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test5:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
@@ -241,10 +261,20 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test11:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test11:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test11:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
@@ -263,10 +293,20 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test12:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test12:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
@@ -395,18 +435,18 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; AVX1-LABEL: test18:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0:
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 33bc93d0fe4db..95d350d45d901 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1343,7 +1343,7 @@ define <2 x double> @test_fminimumnum_vector_nan...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/143312