[llvm] 31b7d43 - [DAG] Extend extract_element(bitcast(scalar_to_vector(X))) -> trunc(srl(X,C)) (#117900)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 29 09:24:42 PST 2024
Author: Simon Pilgrim
Date: 2024-11-29T17:24:38Z
New Revision: 31b7d4333a6c10aa8b7e1a7ca5aa0e281f124ec2
URL: https://github.com/llvm/llvm-project/commit/31b7d4333a6c10aa8b7e1a7ca5aa0e281f124ec2
DIFF: https://github.com/llvm/llvm-project/commit/31b7d4333a6c10aa8b7e1a7ca5aa0e281f124ec2.diff
LOG: [DAG] Extend extract_element(bitcast(scalar_to_vector(X))) -> trunc(srl(X,C)) (#117900)
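When extracting a smaller integer element from a bitcast scalar_to_vector source, the fold was previously limited to truncating the lowest bits of the scalar source.
This patch extends the fold to handle extraction of any other element that lies within the scalar, by right-shifting the source before truncation (e.g. on little-endian, extracting element 1 of a v4i32 bitcast of scalar_to_vector(i64 X) becomes trunc(srl(X, 32))).
Fixes a regression from #117884.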
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/extract-insert.ll
llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
llvm/test/CodeGen/X86/load-partial.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 521829675ae7c3..6c8e9969784c92 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23055,18 +23055,29 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
// trunc i64 X to i32
SDValue X = BCSrc.getOperand(0);
- assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
+ EVT XVT = X.getValueType();
+ assert(XVT.isScalarInteger() && ScalarVT.isScalarInteger() &&
"Extract element and scalar to vector can't change element type "
"from FP to integer.");
unsigned XBitWidth = X.getValueSizeInBits();
- BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+ unsigned Scale = XBitWidth / VecEltBitWidth;
+ BCTruncElt = IsLE ? 0 : Scale - 1;
// An extract element return value type can be wider than its vector
// operand element type. In that case, the high bits are undefined, so
// it's possible that we may need to extend rather than truncate.
- if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
+ if (ExtractIndex < Scale && XBitWidth > VecEltBitWidth) {
assert(XBitWidth % VecEltBitWidth == 0 &&
"Scalar bitwidth must be a multiple of vector element bitwidth");
+
+ if (ExtractIndex != BCTruncElt) {
+ unsigned ShiftIndex =
+ IsLE ? ExtractIndex : (Scale - 1) - ExtractIndex;
+ X = DAG.getNode(
+ ISD::SRL, DL, XVT, X,
+ DAG.getShiftAmountConstant(ShiftIndex * VecEltBitWidth, XVT, DL));
+ }
+
return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
}
}
diff --git a/llvm/test/CodeGen/AArch64/extract-insert.ll b/llvm/test/CodeGen/AArch64/extract-insert.ll
index 077e5f3d042df3..8c133d76ce3173 100644
--- a/llvm/test/CodeGen/AArch64/extract-insert.ll
+++ b/llvm/test/CodeGen/AArch64/extract-insert.ll
@@ -5,9 +5,8 @@
define i32 @trunc_i64_to_i32_le(i64 %x) {
; BE-LABEL: trunc_i64_to_i32_le:
; BE: // %bb.0:
-; BE-NEXT: fmov d0, x0
-; BE-NEXT: rev64 v0.4s, v0.4s
-; BE-NEXT: fmov w0, s0
+; BE-NEXT: lsr x0, x0, #32
+; BE-NEXT: // kill: def $w0 killed $w0 killed $x0
; BE-NEXT: ret
;
; LE-LABEL: trunc_i64_to_i32_le:
@@ -28,8 +27,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) {
;
; LE-LABEL: trunc_i64_to_i32_be:
; LE: // %bb.0:
-; LE-NEXT: fmov d0, x0
-; LE-NEXT: mov w0, v0.s[1]
+; LE-NEXT: lsr x0, x0, #32
+; LE-NEXT: // kill: def $w0 killed $w0 killed $x0
; LE-NEXT: ret
%ins = insertelement <2 x i64> undef, i64 %x, i32 0
%bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -40,9 +39,8 @@ define i32 @trunc_i64_to_i32_be(i64 %x) {
define i16 @trunc_i64_to_i16_le(i64 %x) {
; BE-LABEL: trunc_i64_to_i16_le:
; BE: // %bb.0:
-; BE-NEXT: fmov d0, x0
-; BE-NEXT: rev64 v0.8h, v0.8h
-; BE-NEXT: umov w0, v0.h[0]
+; BE-NEXT: lsr x0, x0, #48
+; BE-NEXT: // kill: def $w0 killed $w0 killed $x0
; BE-NEXT: ret
;
; LE-LABEL: trunc_i64_to_i16_le:
@@ -63,8 +61,8 @@ define i16 @trunc_i64_to_i16_be(i64 %x) {
;
; LE-LABEL: trunc_i64_to_i16_be:
; LE: // %bb.0:
-; LE-NEXT: fmov d0, x0
-; LE-NEXT: umov w0, v0.h[3]
+; LE-NEXT: lsr x0, x0, #48
+; LE-NEXT: // kill: def $w0 killed $w0 killed $x0
; LE-NEXT: ret
%ins = insertelement <2 x i64> undef, i64 %x, i32 0
%bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -75,9 +73,7 @@ define i16 @trunc_i64_to_i16_be(i64 %x) {
define i8 @trunc_i32_to_i8_le(i32 %x) {
; BE-LABEL: trunc_i32_to_i8_le:
; BE: // %bb.0:
-; BE-NEXT: fmov s0, w0
-; BE-NEXT: rev32 v0.16b, v0.16b
-; BE-NEXT: umov w0, v0.b[0]
+; BE-NEXT: lsr w0, w0, #24
; BE-NEXT: ret
;
; LE-LABEL: trunc_i32_to_i8_le:
@@ -96,8 +92,7 @@ define i8 @trunc_i32_to_i8_be(i32 %x) {
;
; LE-LABEL: trunc_i32_to_i8_be:
; LE: // %bb.0:
-; LE-NEXT: fmov s0, w0
-; LE-NEXT: umov w0, v0.b[3]
+; LE-NEXT: lsr w0, w0, #24
; LE-NEXT: ret
%ins = insertelement <4 x i32> undef, i32 %x, i32 0
%bc = bitcast <4 x i32> %ins to <16 x i8>
@@ -115,8 +110,8 @@ define i8 @trunc_i64_to_i8_be(i64 %x) {
;
; LE-LABEL: trunc_i64_to_i8_be:
; LE: // %bb.0:
-; LE-NEXT: fmov d0, x0
-; LE-NEXT: umov w0, v0.b[7]
+; LE-NEXT: lsr x0, x0, #56
+; LE-NEXT: // kill: def $w0 killed $w0 killed $x0
; LE-NEXT: ret
%ins = insertelement <3 x i64> undef, i64 %x, i32 0
%bc = bitcast <3 x i64> %ins to <24 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 66c884e95fa478..b52cbfe08156b7 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -358,11 +358,10 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: add x9, x0, #4
; CHECK-NEXT: ld1r.4h { v0 }, [x9]
-; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: lsr w9, w8, #16
; CHECK-NEXT: strb w8, [x1]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: st1.b { v1 }[2], [x8]
; CHECK-NEXT: add x8, x1, #2
+; CHECK-NEXT: strb w9, [x1, #1]
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
index c1059be946a5fd..b6799c8a88e0cb 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
@@ -16,10 +16,8 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
;
; P9BE-LABEL: scalar_to_vector_half:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: lxsihzx v2, 0, r3
-; P9BE-NEXT: li r3, 0
-; P9BE-NEXT: vsplth v2, v2, 3
-; P9BE-NEXT: vextublx r3, r3, v2
+; P9BE-NEXT: lhz r3, 0(r3)
+; P9BE-NEXT: srwi r3, r3, 24
; P9BE-NEXT: blr
;
; P8LE-LABEL: scalar_to_vector_half:
@@ -30,10 +28,7 @@ define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
; P8BE-LABEL: scalar_to_vector_half:
; P8BE: # %bb.0: # %entry
; P8BE-NEXT: lhz r3, 0(r3)
-; P8BE-NEXT: sldi r3, r3, 48
-; P8BE-NEXT: mtfprd f0, r3
-; P8BE-NEXT: mffprd r3, f0
-; P8BE-NEXT: rldicl r3, r3, 8, 56
+; P8BE-NEXT: srwi r3, r3, 24
; P8BE-NEXT: blr
entry:
%0 = load <2 x i8>, ptr %ad, align 1
diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll
index bec6cce889142e..dba63582ff08b1 100644
--- a/llvm/test/CodeGen/X86/load-partial.ll
+++ b/llvm/test/CodeGen/X86/load-partial.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
;
; Partial Vector Loads - PR16739
@@ -382,38 +382,24 @@ define dso_local i32 @load_partial_illegal_type() {
define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) {
; SSE-LABEL: PR43227:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: psrlq $32, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm1, 672(%rsi)
-; SSE-NEXT: movdqa %xmm0, 688(%rsi)
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, 672(%rsi)
+; SSE-NEXT: movaps %xmm1, 688(%rsi)
; SSE-NEXT: retq
;
-; AVX1-LABEL: PR43227:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, 672(%rsi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: PR43227:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: PR43227:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: vmovaps %ymm0, 672(%rsi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = getelementptr i32, ptr %explicit_0, i64 63
%2 = load <3 x i32>, ptr %1, align 1
%3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> <i32 1, i32 2>
More information about the llvm-commits
mailing list