[llvm] aca34da - Prioritize lowering V{4|16}F32 with blend.
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 24 13:22:24 PST 2023
Author: Noah Goldstein
Date: 2023-02-24T15:22:08-06:00
New Revision: aca34da46da41792614799a8b6a8b31a5a6e23d9
URL: https://github.com/llvm/llvm-project/commit/aca34da46da41792614799a8b6a8b31a5a6e23d9
DIFF: https://github.com/llvm/llvm-project/commit/aca34da46da41792614799a8b6a8b31a5a6e23d9.diff
LOG: Prioritize lowering V{4|16}F32 with blend.
Blend is often the fastest available instruction, so it should be given
higher priority for v4f32 and be available as an option for v16f32.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D143856
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-insert-extract.ll
llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1cbd249aaa91a..6c93e6aea3722 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15445,6 +15445,11 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ if (Subtarget.hasSSE41())
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
@@ -15498,10 +15503,6 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (Subtarget.hasSSE41()) {
- if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
- return Blend;
-
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
@@ -19082,6 +19083,10 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 51e704ba303be..81e003d14cadc 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -4,14 +4,23 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s
define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT: vbroadcastss %xmm1, %zmm1
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
-; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: test1:
+; KNL: ## %bb.0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT: movw $16384, %ax ## imm = 0x4000
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## %bb.0:
+; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT: movw $16384, %ax ## imm = 0x4000
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
%rrr = load float, ptr %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index fdc03a705d53e..790bed4188efe 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -305,19 +305,15 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL: # %bb.0:
-; ALL-NEXT: vmovups (%rdi), %zmm1
-; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
+; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT: vmovups (%eax), %zmm1
-; X86-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X86-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
+; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
+; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT: retl
%ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
%ptrC = getelementptr inbounds float, ptr %ptr, i64 12
More information about the llvm-commits mailing list