[llvm] [InstCombine] Only fold extract element to trunc if vector `hasOneUse` (PR #115627)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 9 17:47:31 PST 2024
https://github.com/peterbell10 created https://github.com/llvm/llvm-project/pull/115627
This fixes a missed optimization caused by the `foldBitcastExtElt` pattern
interfering with other combine patterns. In the case I was hitting, we have IR
that combines two vectors into a new larger vector by extracting elements and
inserting them into the new vector.
```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
%avec = bitcast i32 %a to <2 x half>
%a0 = extractelement <2 x half> %avec, i32 0
%a1 = extractelement <2 x half> %avec, i32 1
%bvec = bitcast i32 %b to <2 x half>
%b0 = extractelement <2 x half> %bvec, i32 0
%b1 = extractelement <2 x half> %bvec, i32 1
%ins0 = insertelement <4 x half> undef, half %a0, i32 0
%ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
%ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
%ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
ret <4 x half> %ins3
}
```
With the current behavior, `InstCombine` converts each vector extract sequence to
```llvm
%tmp = trunc i32 %a to i16
%a0 = bitcast i16 %tmp to half
%a1 = extractelement <2 x half> %avec, i32 1
```
where the extraction of `%a0` is now done by truncating the original integer.
While on it's own this is fairly reasonable, in this case it also blocks the
pattern which converts `extractelement` - `insertelement` into shuffles
which gives the overall simpler result:
```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
%avec = bitcast i32 %a to <2 x half>
%bvec = bitcast i32 %b to <2 x half>
%ins3 = shufflevector <2 x half> %avec, <2 x half> %bvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x half> %ins3
}
```
In this PR I fix the conflict by obeying the `hasOneUse` check even if
there is no shift instruction required. In these cases we can't remove
the vector completely, so the pattern has less benefit anyway.
Also fwiw, I think dropping the `hasOneUse` check for the 0th element
might have been a mistake in the first place. Looking at
https://github.com/llvm/llvm-project/commit/535c5d56a7bc9966036a11362d8984983a4bf090 the commit message only mentions
loosening the `isDesirableIntType` requirement and doesn't mention changing
the `hasOneUse` check at all.
>From 149078c6807da8f0670123a855f7b6c173675e9b Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10 at openai.com>
Date: Sun, 10 Nov 2024 00:00:11 +0000
Subject: [PATCH] [InstCombine] Only fold extract element to trunc if vector
hasOneUse
This fixes a missed optimization caused by the `foldBitcastExtElt` pattern
interfering with other combine patterns. In the case I was hitting, we have IR
that combines two vectors into a new larger vector by extracting elements and
inserting them into the new vector.
```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
%avec = bitcast i32 %a to <2 x half>
%a0 = extractelement <2 x half> %avec, i32 0
%a1 = extractelement <2 x half> %avec, i32 1
%bvec = bitcast i32 %b to <2 x half>
%b0 = extractelement <2 x half> %bvec, i32 0
%b1 = extractelement <2 x half> %bvec, i32 1
%ins0 = insertelement <4 x half> undef, half %a0, i32 0
%ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
%ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
%ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
ret <4 x half> %ins3
}
```
With the current behavior, `InstCombine` converts each vector extract sequence to
```llvm
%tmp = trunc i32 %a to i16
%a0 = bitcast i16 %tmp to half
%a1 = extractelement <2 x half> %avec, i32 1
```
where the extraction of `%a0` is now done by truncating the original integer.
While on it's own this is fairly reasonable, in this case it also blocks the
pattern which converts `extractelement` - `insertelement` into shuffles
which gives the overall simpler result:
```llvm
define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
%avec = bitcast i32 %a to <2 x half>
%bvec = bitcast i32 %b to <2 x half>
%ins3 = shufflevector <2 x half> %avec, <2 x half> %bvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x half> %ins3
}
```
In this PR I fix the conflict by obeying the `hasOneUse` check even if
there is no shift instruction required. In these cases we can't remove
the vector completely, so the pattern has less benefit anyway.
Also fwiw, I think dropping the `hasOneUse` check for the 0th element
might have been a mistake in the first place. Looking at
535c5d56a7bc9966036a11362d8984983a4bf090 the commit message only mentions
loosening the `isDesirableIntType` requirement and doesn't mention changing
the `hasOneUse` check at all.
---
.../InstCombine/InstCombineVectorOps.cpp | 5 ++---
.../Transforms/InstCombine/extractelement.ll | 18 +++++----------
.../InstCombine/vector_insertelt_shuffle.ll | 22 +++++++++++++++++++
3 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 454fe5a91d375a..7d1d053edd6153 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -205,9 +205,8 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
if (IsBigEndian)
ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;
unsigned ShiftAmountC = ExtIndexC * DestWidth;
- if (!ShiftAmountC ||
- (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) &&
- Ext.getVectorOperand()->hasOneUse())) {
+ if ((!ShiftAmountC || isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) &&
+ Ext.getVectorOperand()->hasOneUse()) {
if (ShiftAmountC)
X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");
if (DestTy->isFloatingPointTy()) {
diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll
index 28a4702559c46c..2bd719e2361379 100644
--- a/llvm/test/Transforms/InstCombine/extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement.ll
@@ -722,20 +722,14 @@ define i8 @bitcast_scalar_index_variable(i32 %x, i64 %y) {
ret i8 %r
}
-; extra use is ok if we don't need a shift
+; extra use is not ok, even if we don't need a shift
define i8 @bitcast_scalar_index0_use(i64 %x) {
-; ANYLE-LABEL: @bitcast_scalar_index0_use(
-; ANYLE-NEXT: [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYLE-NEXT: call void @use(<8 x i8> [[V]])
-; ANYLE-NEXT: [[R:%.*]] = trunc i64 [[X]] to i8
-; ANYLE-NEXT: ret i8 [[R]]
-;
-; ANYBE-LABEL: @bitcast_scalar_index0_use(
-; ANYBE-NEXT: [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYBE-NEXT: call void @use(<8 x i8> [[V]])
-; ANYBE-NEXT: [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
-; ANYBE-NEXT: ret i8 [[R]]
+; ANY-LABEL: @bitcast_scalar_index0_use(
+; ANY-NEXT: [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
+; ANY-NEXT: call void @use(<8 x i8> [[V]])
+; ANY-NEXT: [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
+; ANY-NEXT: ret i8 [[R]]
;
%v = bitcast i64 %x to <8 x i8>
diff --git a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
index f745a403642115..ab2a7faa107c79 100644
--- a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
@@ -90,4 +90,26 @@ define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
ret <4 x float> %ins1
}
+; test that foldBitcastExtElt doesn't interfere with shuffle folding
+
+define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
+; CHECK-LABEL: @bitcast_extract_insert_to_shuffle(
+; CHECK-NEXT: [[AVEC:%.*]] = bitcast i32 [[A:%.*]] to <2 x half>
+; CHECK-NEXT: [[BVEC:%.*]] = bitcast i32 [[B:%.*]] to <2 x half>
+; CHECK-NEXT: [[INS3:%.*]] = shufflevector <2 x half> [[AVEC]], <2 x half> [[BVEC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x half> [[INS3]]
+;
+ %avec = bitcast i32 %a to <2 x half>
+ %a0 = extractelement <2 x half> %avec, i32 0
+ %a1 = extractelement <2 x half> %avec, i32 1
+ %bvec = bitcast i32 %b to <2 x half>
+ %b0 = extractelement <2 x half> %bvec, i32 0
+ %b1 = extractelement <2 x half> %bvec, i32 1
+ %ins0 = insertelement <4 x half> undef, half %a0, i32 0
+ %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
+ %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
+ %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
+ ret <4 x half> %ins3
+}
+
More information about the llvm-commits
mailing list