[llvm] [InstCombine] Move extends across identity shuffles. (PR #146901)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 07:22:17 PDT 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/146901
Add a new fold to instcombine to move SExt/ZExt across identity
shuffles, applying the cast after the shuffle. This sinks extends and
can enable additional, more general folding of both shuffles (and
related instructions) and extends. If backends prefer splitting up the
extends and doing the casts first, they can be hoisted again in
VectorCombine, for example.
A larger example is included in the load_i32_zext_to_v4i32 test. The wider
extend is easier to compute an accurate cost for and targets (like
AArch64) can lower a single wider extend more efficiently than multiple
separate extends.
This is a generalization of a VectorCombine version
(https://github.com/llvm/llvm-project/pull/141109) as suggested by
@preames.
From 8c76e575a17afd2212d7a24ee512a0a3cac7646d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 3 Jul 2025 14:53:39 +0100
Subject: [PATCH 1/2] [InstCombine] Add tests for moving exts across identity
shuffles.
Also merges redundant check lines in PhaseOrdering/X86/blendv-select.ll
to reduce test diff in upcoming change.
---
.../InstCombine/fold-shuffle-ext.ll | 108 +++++++++++++++
.../PhaseOrdering/X86/blendv-select.ll | 130 ++++--------------
2 files changed, 138 insertions(+), 100 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
diff --git a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
new file mode 100644
index 0000000000000..c43e9276f20b7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p instcombine -S %s | FileCheck %s
+
+define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(<8 x i8> %x) {
+; CHECK-LABEL: define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(
+; CHECK-SAME: <8 x i8> [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <8 x i8> %x to <8 x i16>
+ %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %shuffle
+}
+
+define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(<4 x i16> %x) {
+; CHECK-LABEL: define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <4 x i16> %x to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %e.1, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+ ret <3 x i32> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask1(<8 x i8> %in) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask1(
+; CHECK-SAME: <8 x i8> [[IN:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[IN]] to <8 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <8 x i8> %in to <8 x i16>
+ %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask2(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask2(
+; CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <8 x i8> %x to <8 x i16>
+ %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %shuffle
+}
+
+define <5 x i32> @ext_identity_mask_first_vector_first_half_5xi32(<4 x i16> %x) {
+; CHECK-LABEL: define <5 x i32> @ext_identity_mask_first_vector_first_half_5xi32(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT: ret <5 x i32> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <4 x i16> %x to <4 x i32>
+ %shuffle = shufflevector <4 x i32> %e.1, <4 x i32> %e.1, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <5 x i32> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask_first_vector_second_half(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask_first_vector_second_half(
+; CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> [[Y]], <4 x i32> <i32 0, i32 9, i32 1, i32 10>
+; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+ %e.1 = zext <8 x i8> %x to <8 x i16>
+ %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> %y, <4 x i32> <i32 0, i32 9, i32 1, i32 10>
+ ret <4 x i16> %shuffle
+}
+
+define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
+; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
+; CHECK-SAME: ptr [[DI:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
+; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
+; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
+; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
+;
+entry:
+ %l = load i32, ptr %di
+ %vec.ins = insertelement <2 x i32> <i32 poison, i32 0>, i32 %l, i64 0
+ %vec.bc = bitcast <2 x i32> %vec.ins to <8 x i8>
+ %e.1 = zext <8 x i8> %vec.bc to <8 x i16>
+ %vec.shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext.2 = zext nneg <4 x i16> %vec.shuffle to <4 x i32>
+ ret <4 x i32> %ext.2
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
index daf4a7b799dd4..bbf893f6127b0 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
@@ -12,20 +12,10 @@
;
define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
-; SSE-LABEL: @x86_pblendvb_v4f64_v2f64(
-; SSE-NEXT: [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT: [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; SSE-NEXT: ret <4 x double> [[DOTV]]
-;
-; AVX2-LABEL: @x86_pblendvb_v4f64_v2f64(
-; AVX2-NEXT: [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT: [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; AVX2-NEXT: ret <4 x double> [[DOTV]]
-;
-; AVX512-LABEL: @x86_pblendvb_v4f64_v2f64(
-; AVX512-NEXT: [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT: [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; AVX512-NEXT: ret <4 x double> [[DOTV]]
+; CHECK-LABEL: @x86_pblendvb_v4f64_v2f64(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT: [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
+; CHECK-NEXT: ret <4 x double> [[DOTV]]
;
%a.bc = bitcast <4 x double> %a to <32 x i8>
%b.bc = bitcast <4 x double> %b to <32 x i8>
@@ -46,20 +36,10 @@ define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b,
}
define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
-; SSE-LABEL: @x86_pblendvb_v8f32_v4f32(
-; SSE-NEXT: [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT: [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; SSE-NEXT: ret <8 x float> [[DOTV]]
-;
-; AVX2-LABEL: @x86_pblendvb_v8f32_v4f32(
-; AVX2-NEXT: [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT: [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; AVX2-NEXT: ret <8 x float> [[DOTV]]
-;
-; AVX512-LABEL: @x86_pblendvb_v8f32_v4f32(
-; AVX512-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT: [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; AVX512-NEXT: ret <8 x float> [[DOTV]]
+; CHECK-LABEL: @x86_pblendvb_v8f32_v4f32(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT: [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
+; CHECK-NEXT: ret <8 x float> [[DOTV]]
;
%a.bc = bitcast <8 x float> %a to <32 x i8>
%b.bc = bitcast <8 x float> %b to <32 x i8>
@@ -80,20 +60,10 @@ define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8
}
define <4 x i64> @x86_pblendvb_v4i64_v2i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v4i64_v2i64(
-; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; SSE-NEXT: ret <4 x i64> [[TMP2]]
-;
-; AVX2-LABEL: @x86_pblendvb_v4i64_v2i64(
-; AVX2-NEXT: [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; AVX2-NEXT: ret <4 x i64> [[TMP2]]
-;
-; AVX512-LABEL: @x86_pblendvb_v4i64_v2i64(
-; AVX512-NEXT: [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT: [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; AVX512-NEXT: ret <4 x i64> [[TMP1]]
+; CHECK-LABEL: @x86_pblendvb_v4i64_v2i64(
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
;
%a.bc = bitcast <4 x i64> %a to <32 x i8>
%b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -216,35 +186,15 @@ define <4 x i64> @x86_pblendvb_v16i16_v8i16(<4 x i64> %a, <4 x i64> %b, <4 x i64
}
define <4 x i64> @x86_pblendvb_v32i8_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v32i8_v16i8(
-; SSE-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; SSE-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; SSE-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; SSE-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; SSE-NEXT: [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; SSE-NEXT: [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
-; SSE-NEXT: [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; SSE-NEXT: ret <4 x i64> [[RES]]
-;
-; AVX2-LABEL: @x86_pblendvb_v32i8_v16i8(
-; AVX2-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; AVX2-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; AVX2-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; AVX2-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; AVX2-NEXT: [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; AVX2-NEXT: [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
-; AVX2-NEXT: [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; AVX2-NEXT: ret <4 x i64> [[RES]]
-;
-; AVX512-LABEL: @x86_pblendvb_v32i8_v16i8(
-; AVX512-NEXT: [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; AVX512-NEXT: [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; AVX512-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; AVX512-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; AVX512-NEXT: [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; AVX512-NEXT: [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
-; AVX512-NEXT: [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; AVX512-NEXT: ret <4 x i64> [[RES]]
+; CHECK-LABEL: @x86_pblendvb_v32i8_v16i8(
+; CHECK-NEXT: [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; CHECK-NEXT: [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; CHECK-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
+; CHECK-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
+; CHECK-NEXT: [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
+; CHECK-NEXT: [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> [[RES]]
;
%a.bc = bitcast <4 x i64> %a to <32 x i8>
%b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -424,35 +374,15 @@ define <8 x i64> @x86_pblendvb_v32i16_v16i16(<8 x i64> %a, <8 x i64> %b, <8 x i6
}
define <8 x i64> @x86_pblendvb_v64i8_v32i8(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v64i8_v32i8(
-; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; SSE-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; SSE-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; SSE-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; SSE-NEXT: [[TMP3:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; SSE-NEXT: [[CONCAT:%.*]] = select <64 x i1> [[TMP3]], <64 x i8> [[TMP2]], <64 x i8> [[TMP1]]
-; SSE-NEXT: [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; SSE-NEXT: ret <8 x i64> [[RES]]
-;
-; AVX2-LABEL: @x86_pblendvb_v64i8_v32i8(
-; AVX2-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; AVX2-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; AVX2-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; AVX2-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; AVX2-NEXT: [[TMP3:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; AVX2-NEXT: [[CONCAT:%.*]] = select <64 x i1> [[TMP3]], <64 x i8> [[TMP2]], <64 x i8> [[TMP1]]
-; AVX2-NEXT: [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; AVX2-NEXT: ret <8 x i64> [[RES]]
-;
-; AVX512-LABEL: @x86_pblendvb_v64i8_v32i8(
-; AVX512-NEXT: [[A_BC:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; AVX512-NEXT: [[B_BC:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; AVX512-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; AVX512-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; AVX512-NEXT: [[CMP:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; AVX512-NEXT: [[CONCAT:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[B_BC]], <64 x i8> [[A_BC]]
-; AVX512-NEXT: [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; AVX512-NEXT: ret <8 x i64> [[RES]]
+; CHECK-LABEL: @x86_pblendvb_v64i8_v32i8(
+; CHECK-NEXT: [[A_BC:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
+; CHECK-NEXT: [[B_BC:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
+; CHECK-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
+; CHECK-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
+; CHECK-NEXT: [[CONCAT:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[B_BC]], <64 x i8> [[A_BC]]
+; CHECK-NEXT: [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
+; CHECK-NEXT: ret <8 x i64> [[RES]]
;
%a.bc = bitcast <8 x i64> %a to <64 x i8>
%b.bc = bitcast <8 x i64> %b to <64 x i8>
From 615cca428c34746b0ae1df2e88a714f3f7f1f926 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 3 Jul 2025 12:07:42 +0100
Subject: [PATCH 2/2] [InstCombine] Move extends across identity shuffles.
Add a new fold to instcombine to move SExt/ZExt across identity
shuffles, applying the cast after the shuffle. This sinks extends and
can enable additional, more general folding of both shuffles (and
related instructions) and extends. If backends prefer splitting up the
extends and doing the casts first, they can be hoisted again in
VectorCombine, for example.
A larger example is included in the load_i32_zext_to_v4i32 test. The wider
extend is easier to compute an accurate cost for and targets (like
AArch64) can lower a single wider extend more efficiently than multiple
separate extends.
This is a generalization of a VectorCombine version
(https://github.com/llvm/llvm-project/pull/141109/) as suggested by
@preames.
---
.../InstCombine/InstCombineVectorOps.cpp | 24 +++++++++++++++----
.../Transforms/InstCombine/X86/blend_x86.ll | 2 +-
.../InstCombine/fold-shuffle-ext.ll | 16 +++++--------
.../PhaseOrdering/X86/blendv-select.ll | 5 ++--
4 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index a746a5229fb9a..a31fd68dc7165 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2559,17 +2559,16 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf,
/// Canonicalize casts after shuffle.
static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
InstCombiner::BuilderTy &Builder) {
- // Do we have 2 matching cast operands?
auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0));
- auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
- if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
- Cast0->getSrcTy() != Cast1->getSrcTy())
+ if (!Cast0)
return nullptr;
// TODO: Allow other opcodes? That would require easing the type restrictions
// below here.
CastInst::CastOps CastOpcode = Cast0->getOpcode();
switch (CastOpcode) {
+ case Instruction::SExt:
+ case Instruction::ZExt:
case Instruction::FPToSI:
case Instruction::FPToUI:
case Instruction::SIToFP:
@@ -2579,15 +2578,30 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
return nullptr;
}
+ VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
VectorType *ShufTy = Shuf.getType();
VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType());
- VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
// TODO: Allow length-increasing shuffles?
if (ShufTy->getElementCount().getKnownMinValue() >
ShufOpTy->getElementCount().getKnownMinValue())
return nullptr;
+ // shuffle (cast X), Y, identity-with-extract-mask -->
+ // cast (shuffle X, Y, identity-with-extract-mask).
+ if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract()) {
+ auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0),
+ PoisonValue::get(CastSrcTy),
+ Shuf.getShuffleMask());
+ return CastInst::Create(Cast0->getOpcode(), NewIns, Shuf.getType());
+ }
+
+ auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
+ // Do we have 2 matching cast operands?
+ if (!Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
+ Cast0->getSrcTy() != Cast1->getSrcTy())
+ return nullptr;
+
// TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)?
assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) &&
"Expected fixed vector operands for casts and binary shuffle");
diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index aa49f493c9fa1..5c3b0beefbb66 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -287,9 +287,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f
; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
; CHECK-NEXT: [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[B_LO]], <4 x i32> [[A_LO]]
; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[RES]]
diff --git a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
index c43e9276f20b7..84ce83d40bee9 100644
--- a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
+++ b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
@@ -5,8 +5,8 @@ define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(<8 x i8> %x) {
; CHECK-LABEL: define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(
; CHECK-SAME: <8 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]]
;
entry:
@@ -19,8 +19,8 @@ define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(<4 x i16> %x)
; CHECK-LABEL: define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(
; CHECK-SAME: <4 x i16> [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = zext <3 x i16> [[TMP0]] to <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
;
entry:
@@ -89,12 +89,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
; CHECK-SAME: ptr [[DI:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
-; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[L1:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
index bbf893f6127b0..dbce77698eb07 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
@@ -496,9 +496,8 @@ define <2 x i64> @x86_pblendvb_v32i8_v16i8_undefs(<4 x i64> %a, <4 x i64> %b, <4
; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <32 x i8> [[A_BC]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <32 x i8> [[B_BC]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; CHECK-NEXT: [[SEXT:%.*]] = sext <32 x i1> [[CMP]] to <32 x i8>
-; CHECK-NEXT: [[SEXT_LO:%.*]] = shufflevector <32 x i8> [[SEXT]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[SEL_LO:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO]], <16 x i8> [[B_LO]], <16 x i8> [[SEXT_LO]])
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[SEL_LO:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[B_LO]], <16 x i8> [[A_LO]]
; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i8> [[SEL_LO]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[RES]]
;
More information about the llvm-commits
mailing list