[llvm] DAG: Fix vector bin op scalarize defining a partially undef vector (PR #122459)

Tue Jan 14 20:16:56 PST 2025

https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122459

>From d12bd2518aed188d6986186cd22bcdcd56de7fa6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 10 Jan 2025 16:18:18 +0700
Subject: [PATCH 1/4] DAG: Fix vector bin op scalarize defining a partially
 undef vector

This avoids some of the pending regressions after AMDGPU implements
isExtractVecEltCheap.

In a case like shl <value, undef>, splat k, because the second operand
was fully defined, we would fall through and use the splat value for the
first operand, losing the undef high bits. This would result in an additional
instruction to handle the high bits. Add some reduced testcases for different
opcodes for one of the regressions.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   8 +-
 llvm/test/CodeGen/AMDGPU/trunc-combine.ll     | 331 ++++++++++++++++++
 2 files changed, 337 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index da3c834417d6b2..712e52ee8fc921 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27525,8 +27525,12 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
-      count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
-      count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
+      // This is assuming if either input is undef, the result will fold out.
+      //
+      // TODO: Do we need to check if the opcode/operand propagates undef?
+      // Should we ignore operation identity values?
+      ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) ||
+       (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     // build_vec ..undef, (bo X, Y), undef...
     SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index aa3e05fdbdb36a..02e30b6c68e994 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -156,3 +156,334 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
   %trunc = trunc <2 x i64> %arg0 to <2 x i16>
   ret <2 x i16> %trunc
 }
+
+; Test for regression where an unnecessary v_alignbit_b32 was inserted
+; on the final result, due to losing the fact that the upper half of
+; the lhs vector was undef.
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshr_b32_e32 v0, 16, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e64 v0, v0, 16
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = lshr <2 x i32> splat (i32 16), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %ashr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_ashr_i32_e32 v0, -4, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ashrrev_i32_e64 v0, v0, -4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = ashr <2 x i32> splat (i32 -4), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = add <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshl_b32_e32 v0, 2, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = shl <2 x i32> splat (i32 2), %undef.hi.elt
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = sub <2 x i32> %undef.hi.elt, splat (i32 16)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, 0xffff0011, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0xffff
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v0, 0xffff0011, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_xor_b32_e32 v0, 17, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_xor_b32_e32 v0, 17, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = xor <2 x i32> %undef.hi.elt, splat (i32 17)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffc, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %shl = shl <2 x i32> %undef.hi.elt, splat (i32 2)
+  %trunc = trunc <2 x i32> %shl to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v0, v0, 18
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mul_lo_u32 v0, v0, 18
+; VI-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_i32 v0, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_i32 v0, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
+; VI-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = sdiv <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_i32 v1, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; SI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_i32 v1, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; VI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = srem <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+
+define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_u32 v0, v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_bfe_u32 v0, v0, 2, 16
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_u32 v0, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = udiv <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; SI-NEXT:    v_mul_hi_u32 v1, v0, s4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, 0x38e38e39
+; VI-NEXT:    v_mul_hi_u32 v1, v0, s4
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; VI-NEXT:    v_mul_lo_u32 v1, v1, 18
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+  %lshr = urem <2 x i32> %undef.hi.elt, splat (i32 18)
+  %trunc = trunc <2 x i32> %lshr to <2 x i16>
+  ret <2 x i16> %trunc
+}

>From 673b88fbbe885f4a0e5192cfddd4a1de4745f73d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 14 Jan 2025 21:21:46 +0700
Subject: [PATCH 2/4] Just use insert_vector_elt

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 712e52ee8fc921..51381b85a5e1b6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27524,18 +27524,11 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
 
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
-  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
-      // This is assuming if either input is undef, the result will fold out.
-      //
-      // TODO: Do we need to check if the opcode/operand propagates undef?
-      // Should we ignore operation identity values?
-      ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) ||
-       (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) {
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
-    // build_vec ..undef, (bo X, Y), undef...
-    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
-    Ops[Index0] = ScalarBO;
-    return DAG.getBuildVector(VT, DL, Ops);
+    //   insert_vector_elt undef, (bo X, Y), index
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT),
+                       ScalarBO, IndexC);
   }
 
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index

>From 667942923f57d80619f4416d4131a7785f5da4e1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 15 Jan 2025 10:44:34 +0700
Subject: [PATCH 3/4] Check if getNode folded to undef instead

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 51381b85a5e1b6..7cd1374f3e2e72 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27527,8 +27527,24 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
     //   insert_vector_elt undef, (bo X, Y), index
-    return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT),
-                       ScalarBO, IndexC);
+
+    SmallVector<SDValue, 16> EltsX, EltsY;
+    DAG.ExtractVectorElements(Src0, EltsX);
+    DAG.ExtractVectorElements(Src1, EltsY);
+
+    SmallVector<SDValue, 16> EltsResult;
+
+    unsigned NonUndefElements = 0;
+    for (auto [X, Y] : zip(EltsX, EltsY)) {
+      SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
+      if (!ScalarBO.isUndef())
+        if (NonUndefElements++ > 1)
+          break;
+      EltsResult.push_back(ScalarBO);
+    }
+
+    if (NonUndefElements == 1)
+      return DAG.getBuildVector(VT, DL, EltsResult);
   }
 
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index

>From 0fd42cfc29f65bf7bc5cfda0f1c1e0ececc69825 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 15 Jan 2025 10:47:41 +0700
Subject: [PATCH 4/4] Don't bother counting the undefs

It folds to undef or a constant and seems to have the best result
anyway rather than overdefining by splatting the variable value.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 31 ++++++-------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7cd1374f3e2e72..b033094475d32e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27517,36 +27517,25 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
   if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT))
     return SDValue();
 
-  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
-  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
-  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
-  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
-
   // If all lanes but 1 are undefined, no need to splat the scalar result.
   // TODO: Keep track of undefs and use that info in the general case.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) {
     // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
-    //   insert_vector_elt undef, (bo X, Y), index
-
-    SmallVector<SDValue, 16> EltsX, EltsY;
+    //   build_vec ..undef, (bo X, Y), undef...
+    SmallVector<SDValue, 16> EltsX, EltsY, EltsResult;
     DAG.ExtractVectorElements(Src0, EltsX);
     DAG.ExtractVectorElements(Src1, EltsY);
 
-    SmallVector<SDValue, 16> EltsResult;
-
-    unsigned NonUndefElements = 0;
-    for (auto [X, Y] : zip(EltsX, EltsY)) {
-      SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
-      if (!ScalarBO.isUndef())
-        if (NonUndefElements++ > 1)
-          break;
-      EltsResult.push_back(ScalarBO);
-    }
-
-    if (NonUndefElements == 1)
-      return DAG.getBuildVector(VT, DL, EltsResult);
+    for (auto [X, Y] : zip(EltsX, EltsY))
+      EltsResult.push_back(DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()));
+    return DAG.getBuildVector(VT, DL, EltsResult);
   }
 
+  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
+  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
+  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
+
   // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
   return DAG.getSplat(VT, DL, ScalarBO);
 }