[llvm] 13c359a - [X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles directly (#83120)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 27 07:04:04 PST 2024
Author: Simon Pilgrim
Date: 2024-02-27T15:03:42Z
New Revision: 13c359aa9b5eae0715494505d989cfc8ed1d39cf
URL: https://github.com/llvm/llvm-project/commit/13c359aa9b5eae0715494505d989cfc8ed1d39cf
DIFF: https://github.com/llvm/llvm-project/commit/13c359aa9b5eae0715494505d989cfc8ed1d39cf.diff
LOG: [X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles directly (#83120)
We were scalarizing these truncations, but in most cases we can widen the source vector to 128-bits and perform the truncation as a shuffle directly (which will usually lower as a PACK or PSHUFB).
For the cases where the widening and shuffle aren't legal, we can leave it to generic legalization to scalarize for us.
Fixes #81883
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/extract-concat.ll
llvm/test/CodeGen/X86/vec_anyext.ll
llvm/test/CodeGen/X86/vec_cast.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a86f13135173b0..a134ca32c66bd3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32341,20 +32341,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
}
- if (128 % InBits == 0) {
+ if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
// 128 bit and smaller inputs should avoid truncate all together and
- // just use a build_vector that will become a shuffle.
- // TODO: Widen and use a shuffle directly?
- SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
- // Use the original element count so we don't do more scalar opts than
- // necessary.
- for (unsigned i=0; i < MinElts; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
- DAG.getIntPtrConstant(i, dl));
- Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
- }
- Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
- return;
+ // use a shuffle.
+ if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
+ int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
+ SmallVector<int, 16> TruncMask(WidenNumElts, -1);
+ for (unsigned I = 0; I < MinElts; ++I)
+ TruncMask[I] = Scale * I;
+ SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
+ assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
+ "Illegal vector type in truncation");
+ WidenIn = DAG.getBitcast(WidenVT, WidenIn);
+ Results.push_back(
+ DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
+ return;
+ }
}
// With AVX512 there are some cases that can use a target specific
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 93dbe99882fe0b..e7415dcf229f40 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT: shll $8, %ecx
-; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: orl %ecx, %eax
-; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; SSE2-NEXT: movl %eax, (%rdi)
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: foo:
; SSE42: # %bb.0:
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index 09e4a4b3a773d1..e229165be967a5 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
;
; X64-LABEL: func_8_16:
; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $16, %ecx
-; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-NEXT: shrq $48, %rax
-; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; X64-NEXT: movq (%rsi), %rax
-; X64-NEXT: vmovd %eax, %xmm1
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $16, %ecx
-; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; X64-NEXT: shrq $48, %rax
-; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: retq
%F = load <4 x i16>, ptr %a
%G = trunc <4 x i16> %F to <4 x i8>
diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
index e0089354cc9530..0a6bc2f59b685b 100644
--- a/llvm/test/CodeGen/X86/vec_cast.ll
+++ b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
; CHECK-WIN-LABEL: h:
; CHECK-WIN: # %bb.0:
; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0
-; CHECK-WIN-NEXT: movl (%rcx), %eax
+; CHECK-WIN-NEXT: movd %xmm0, %eax
; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx
; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx
; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax
More information about the llvm-commits
mailing list