[llvm] [InstCombine] Do not combine shuffle+bitcast if the bitcast is eliminable. (PR #135769)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 29 08:11:03 PDT 2025


https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/135769

>From 23c096c371c3cbd1ff891ff24d1ca1e31f241bf7 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 14 Apr 2025 07:58:07 -0700
Subject: [PATCH 1/4] Precommit test.

---
 .../Transforms/InstCombine/shufflevec-bitcast.ll | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index f20077243273c..ea1527566dc98 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -235,3 +235,19 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) {
   %r = shufflevector <4 x i4> %b, <4 x i4> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i4> %r
 }
+
+; Negative test - chain of bitcasts.
+
+define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
+; CHECK-LABEL: @shuf_bitcast_chain(
+; CHECK-NEXT:    [[S_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[S_EXTRACT:%.*]] = extractelement <2 x i128> [[S_BC]], i64 0
+; CHECK-NEXT:    [[C:%.*]] = bitcast i128 [[S_EXTRACT]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[C]]
+;
+  %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = bitcast <4 x i32> %s to <2 x i64>
+  %b = bitcast <2 x i64> %a to i128
+  %c = bitcast i128 %b to <16 x i8>
+  ret <16 x i8> %c
+}

>From 9d9d5853df3150978e36ece6335b2c03ae2935d7 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 11 Apr 2025 08:56:45 -0700
Subject: [PATCH 2/4] [InstCombine] Do not combine shuffle+bitcast if the
 bitcast is eliminable.

If we are attempting to combine shuffle+bitcast but the bitcast is
pairable with a subsequent bitcast, we should not fold the shuffle as
doing so can block further simplifications.

The motivation for this is a long-standing regression affecting SIMDe on
AArch64 introduced indirectly by the alwaysinliner (1a2e77cf). Examples
of reproducers:
* https://godbolt.org/z/53qx18s6M
* https://godbolt.org/z/o5e43h5M7
---
 .../InstCombine/InstCombineVectorOps.cpp         | 16 ++++++++++++----
 .../Transforms/InstCombine/shufflevec-bitcast.ll |  5 ++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index f897cc7855d2d..f6423cb40492e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -3029,10 +3029,18 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     SmallVector<BitCastInst *, 8> BCs;
     DenseMap<Type *, Value *> NewBCs;
     for (User *U : SVI.users())
-      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
-        if (!BC->use_empty())
-          // Only visit bitcasts that weren't previously handled.
-          BCs.push_back(BC);
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) {
+        // Only visit bitcasts that weren't previously handled.
+        if (BC->use_empty())
+          continue;
+        // Prefer to combine bitcasts of bitcasts before attempting this fold.
+        if (BC->hasOneUse()) {
+          auto *BC2 = dyn_cast<BitCastInst>(BC->user_back());
+          if (BC2 && isEliminableCastPair(BC, BC2))
+            continue;
+        }
+        BCs.push_back(BC);
+      }
     for (BitCastInst *BC : BCs) {
       unsigned BegIdx = Mask.front();
       Type *TgtTy = BC->getDestTy();
diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index ea1527566dc98..c6152368f06fd 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -240,9 +240,8 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) {
 
 define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
 ; CHECK-LABEL: @shuf_bitcast_chain(
-; CHECK-NEXT:    [[S_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
-; CHECK-NEXT:    [[S_EXTRACT:%.*]] = extractelement <2 x i128> [[S_BC]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i128 [[S_EXTRACT]] to <16 x i8>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[C:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[C]]
 ;
   %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

>From cbdcf1fbdcc330616610ee75d849b713a7e7e9f8 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 29 Apr 2025 08:00:41 -0700
Subject: [PATCH 3/4] Add new test (transformation off).

---
 .../InstCombine/shufflevec-bitcast.ll         | 28 +++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index c6152368f06fd..468bf85b4a85b 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -240,8 +240,9 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) {
 
 define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
 ; CHECK-LABEL: @shuf_bitcast_chain(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[C:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8>
+; CHECK-NEXT:    [[S_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[S_EXTRACT:%.*]] = extractelement <2 x i128> [[S_BC]], i64 0
+; CHECK-NEXT:    [[C:%.*]] = bitcast i128 [[S_EXTRACT]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[C]]
 ;
   %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -250,3 +251,26 @@ define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
   %c = bitcast i128 %b to <16 x i8>
   ret <16 x i8> %c
 }
+
+; Same as above, but showing why it's not feasible to implement the reverse
+; fold in VectorCombine (see #136998).
+
+define <4 x i32> @shuf_bitcast_chain_2(<8 x i32> %v) {
+; CHECK-LABEL: @shuf_bitcast_chain_2(
+; CHECK-NEXT:    [[S0_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[S0_EXTRACT:%.*]] = extractelement <2 x i128> [[S0_BC]], i64 0
+; CHECK-NEXT:    [[S1_BC:%.*]] = bitcast <8 x i32> [[V]] to <2 x i128>
+; CHECK-NEXT:    [[S1_EXTRACT:%.*]] = extractelement <2 x i128> [[S1_BC]], i64 1
+; CHECK-NEXT:    [[R1:%.*]] = or i128 [[S0_EXTRACT]], [[S1_EXTRACT]]
+; CHECK-NEXT:    [[R:%.*]] = bitcast i128 [[R1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %s0 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %b0 = bitcast <4 x i32> %s0 to i128
+  %b1 = bitcast <4 x i32> %s1 to i128
+  %c0 = bitcast i128 %b0 to <4 x i32>
+  %c1 = bitcast i128 %b1 to <4 x i32>
+  %r = or <4 x i32> %c0, %c1
+  ret <4 x i32> %r
+}

>From 06ecbaecf2be74031ef61cf9ccb90ad82a828f8d Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 29 Apr 2025 08:01:58 -0700
Subject: [PATCH 4/4] Update tests (transformation on).

---
 .../Transforms/InstCombine/shufflevec-bitcast.ll   | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index 468bf85b4a85b..877dd1eefbae4 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -240,9 +240,8 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) {
 
 define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
 ; CHECK-LABEL: @shuf_bitcast_chain(
-; CHECK-NEXT:    [[S_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
-; CHECK-NEXT:    [[S_EXTRACT:%.*]] = extractelement <2 x i128> [[S_BC]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i128 [[S_EXTRACT]] to <16 x i8>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[C:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[C]]
 ;
   %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -257,12 +256,9 @@ define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
 
 define <4 x i32> @shuf_bitcast_chain_2(<8 x i32> %v) {
 ; CHECK-LABEL: @shuf_bitcast_chain_2(
-; CHECK-NEXT:    [[S0_BC:%.*]] = bitcast <8 x i32> [[V:%.*]] to <2 x i128>
-; CHECK-NEXT:    [[S0_EXTRACT:%.*]] = extractelement <2 x i128> [[S0_BC]], i64 0
-; CHECK-NEXT:    [[S1_BC:%.*]] = bitcast <8 x i32> [[V]] to <2 x i128>
-; CHECK-NEXT:    [[S1_EXTRACT:%.*]] = extractelement <2 x i128> [[S1_BC]], i64 1
-; CHECK-NEXT:    [[R1:%.*]] = or i128 [[S0_EXTRACT]], [[S1_EXTRACT]]
-; CHECK-NEXT:    [[R:%.*]] = bitcast i128 [[R1]] to <4 x i32>
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = or <4 x i32> [[S0]], [[S1]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %s0 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>



More information about the llvm-commits mailing list