[llvm] DAG: Fix vector bin op scalarize defining a partially undef vector (PR #122459)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 16 04:54:54 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122459

From f83583a7283f9e7c1c1306025898a52d33247b31 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 10 Jan 2025 16:18:18 +0700
Subject: [PATCH 1/6] DAG: Fix vector bin op scalarize defining a partially
 undef vector

This avoids some of the pending regressions that appear once AMDGPU
implements isExtractVecEltCheap.

In a case like shl <value, undef>, splat k, the second operand is fully
defined, so we would fall through and treat the first operand as a splat
as well, losing its undef high bits. That cost an additional instruction
to handle the high bits. Add some reduced testcases covering different
opcodes from one of the regressions.
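
For illustration, a minimal IR sketch of the pattern described above,
modeled on the testcases added below (the function name is made up):

  ; Only element 0 is defined; element 1 remains poison.
  define <2 x i16> @shl_partially_undef_sketch(i32 %x) {
    %vec = insertelement <2 x i32> poison, i32 %x, i32 0
    ; The rhs is a fully defined splat, so the combine used to treat the
    ; lhs as a splat of %x as well, over-defining the high lane.
    %shl = shl <2 x i32> %vec, splat (i32 2)
    %trunc = trunc <2 x i32> %shl to <2 x i16>
    ret <2 x i16> %trunc
  }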
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   8 +-
 llvm/test/CodeGen/AMDGPU/trunc-combine.ll     | 331 ++++++++++++++++++
 2 files changed, 337 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 02b79c67af3ee0..7f58c3090eb640 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27533,8 +27533,12 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
-      count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
-      count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
+      // This is assuming if either input is undef, the result will fold out.
+      //
+      // TODO: Do we need to check if the opcode/operand propagates undef?
+      // Should we ignore operation identity values?
+      ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) ||
+       (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     // build_vec ..undef, (bo X, Y), undef...
     SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index aa3e05fdbdb36a..02e30b6c68e994 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -156,3 +156,334 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
   %trunc = trunc <2 x i64> %arg0 to <2 x i16>
   ret <2 x i16> %trunc
 }
+
+; Test for a regression where an unnecessary v_alignbit_b32 was inserted
+; into the final result, due to losing the fact that the upper half of
+; the lhs vector was undef.
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshr_b32_e32 v0, 16, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e64 v0, v0, 16
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = lshr <2 x i32> splat (i32 16), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %ashr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_ashr_i32_e32 v0, -4, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ashrrev_i32_e64 v0, v0, -4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = ashr <2 x i32> splat (i32 -4), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = add <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshl_b32_e32 v0, 2, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = shl <2 x i32> splat (i32 2), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = sub <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, 0xffff0011, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0xffff
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v0, 0xffff0011, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_xor_b32_e32 v0, 17, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_xor_b32_e32 v0, 17, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = xor <2 x i32> %undef.hi.elt, splat (i32 17)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffc, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %shl = shl <2 x i32> %undef.hi.elt, splat (i32 2)
+  %trunc = trunc <2 x i32> %shl to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v0, v0, 18
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mul_lo_u32 v0, v0, 18
+; VI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_i32 v0, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_i32 v0, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
+; VI-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = sdiv <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_i32 v1, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; SI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_i32 v1, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; VI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = srem <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+
+define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_u32 v0, v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_bfe_u32 v0, v0, 2, 16
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_u32 v0, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = udiv <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_u32 v1, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_u32 v1, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; VI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = urem <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}

From f9ada5a7c24f272c046a20028483ccf320bf5eae Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 14 Jan 2025 21:21:46 +0700
Subject: [PATCH 2/6] Just use insert_vector_elt

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7f58c3090eb640..4e94fb369cc81d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27532,18 +27532,11 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
 
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
-  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
-      // This is assuming if either input is undef, the result will fold out.
-      //
-      // TODO: Do we need to check if the opcode/operand propagates undef?
-      // Should we ignore operation identity values?
-      ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) ||
-       (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) {
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
-    // build_vec ..undef, (bo X, Y), undef...
-    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
-    Ops[Index0] = ScalarBO;
-    return DAG.getBuildVector(VT, DL, Ops);
+    //   insert_vector_elt undef, (bo X, Y), index
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT),
+                       ScalarBO, IndexC);
   }
 
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index

From 7e48649e5c79490a9040d8d878f903eea6bf52dd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 15 Jan 2025 10:44:34 +0700
Subject: [PATCH 3/6] Check if getNode folded to undef instead

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4e94fb369cc81d..a62c20ba0ae51a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27535,8 +27535,24 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     //   insert_vector_elt undef, (bo X, Y), index
-    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT),
-                       ScalarBO, IndexC);
+
+    SmallVector<SDValue, 16> EltsX, EltsY;
+    DAG.ExtractVectorElements(Src0, EltsX);
+    DAG.ExtractVectorElements(Src1, EltsY);
+
+    SmallVector<SDValue, 16> EltsResult;
+
+    unsigned NonUndefElements = 0;
+    for (auto [X, Y] : zip(EltsX, EltsY)) {
+      SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
+      if (!ScalarBO.isUndef())
+        if (NonUndefElements++ > 1)
+          break;
+      EltsResult.push_back(ScalarBO);
+    }
+
+    if (NonUndefElements == 1)
+      return DAG.getBuildVector(VT, DL, EltsResult);
   }
 
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index

From 4c6f15fe6909f2f97f75adb4c1f48ff6c89becd4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 15 Jan 2025 10:47:41 +0700
Subject: [PATCH 4/6] Don't bother counting the undefs

Each element folds to undef or a constant, and this seems to give the
best result anyway, rather than over-defining lanes by splatting the
variable value.
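
For context, SelectionDAG::getNode already folds binary operators with an
undef operand, which is why the elementwise rebuild cannot over-define a
lane. A simplified sketch of that folding, from memory rather than verbatim
source (the real switch covers more opcodes):

  // In the binary-op path of SelectionDAG::getNode, when the RHS is undef:
  if (N2.isUndef()) {
    switch (Opcode) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::SDIV:
    case ISD::UDIV:
    case ISD::SREM:
    case ISD::UREM:
      return getUNDEF(VT);               // op(x, undef) -> undef
    case ISD::MUL:
    case ISD::AND:
      return getConstant(0, DL, VT);     // op(x, undef) -> 0
    case ISD::OR:
      return getAllOnesConstant(DL, VT); // or(x, undef) -> -1
    }
  }

The or testcase in patch 1 shows the effect: the undef high lane folds to
all-ones, visible as the 0xffff high half in the SI output.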
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 31 ++++++-------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a62c20ba0ae51a..e77fa9ca59a011 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27525,36 +27525,25 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT))
     return SDValue();
 
-  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
-  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
-  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
-  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
-
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
-    //   insert_vector_elt undef, (bo X, Y), index
-
-    SmallVector<SDValue, 16> EltsX, EltsY;
+    //   build_vec ..undef, (bo X, Y), undef...
+    SmallVector<SDValue, 16> EltsX, EltsY, EltsResult;
     DAG.ExtractVectorElements(Src0, EltsX);
     DAG.ExtractVectorElements(Src1, EltsY);
 
-    SmallVector<SDValue, 16> EltsResult;
-
-    unsigned NonUndefElements = 0;
-    for (auto [X, Y] : zip(EltsX, EltsY)) {
-      SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
-      if (!ScalarBO.isUndef())
-        if (NonUndefElements++ > 1)
-          break;
-      EltsResult.push_back(ScalarBO);
-    }
-
-    if (NonUndefElements == 1)
-      return DAG.getBuildVector(VT, DL, EltsResult);
+    for (auto [X, Y] : zip(EltsX, EltsY))
+      EltsResult.push_back(DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()));
+    return DAG.getBuildVector(VT, DL, EltsResult);
   }
 
+  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
+  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
+  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
+
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
   return DAG.getSplat(VT, DL, ScalarBO);
 }

From ac0df733193f021b3b680d6ca6b90101f6639b71 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 16 Jan 2025 18:42:15 +0700
Subject: [PATCH 5/6] Update comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e77fa9ca59a011..1db45e4eda9ba0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27525,8 +27525,8 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT))
     return SDValue();
 
-  // If all lanes but 1 are undefined, no need to splat the scalar result.
-  // TODO: Keep track of undefs and use that info in the general case.
+  // If all result lanes but 1 are undefined, no need to splat the scalar
+  // result.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     //   build_vec ..undef, (bo X, Y), undef...

From ea735413cd1f49e142f9c618c7940b9835adcfea Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 16 Jan 2025 19:54:29 +0700
Subject: [PATCH 6/6] Fix comment again

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1db45e4eda9ba0..ab5e18f5588d7b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27525,9 +27525,11 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT))
     return SDValue();
 
-  // If all result lanes but 1 are undefined, no need to splat the scalar
-  // result.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
+    // All but one element should have an undef input, which will fold to a
+    // constant or undef. Avoid splatting which would over-define potentially
+    // undefined elements.
+
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     //   build_vec ..undef, (bo X, Y), undef...
     SmallVector<SDValue, 16> EltsX, EltsY, EltsResult;


