[llvm] [NVPTX] add combiner rule for final packed op in reduction (PR #143943)

Fri Jun 13 00:54:42 PDT 2025

Prince781 wrote:

The default expansion of reduction intrinsics happens in `ExpandReductions`, which is an IR-level pass. The pass generates a shuffle reduction sequence, which uses `shufflevector`s to iteratively fold the vector in half. This sequence only becomes problematic when the target supports vector types that fit in a single register (see the final `v2i16` shuffle + `smin` in the trace below). I will investigate whether we can do the fixup in DAGCombiner when the final shuffle is legal, as @AlexMaclean suggests.

<details>
<summary>
Here's a trace of the old behavior on this IR:

```llvm
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

define i16 @reduce_smin_i16(<8 x i16> %in) {
  %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
  ret i16 %res
}
```
</summary>

```
% llc < reduction-intrinsics.ll -debug-only=isel -mcpu=sm_100 -print-before=expand-reductions -print-after=expand-reductions
*** IR Dump Before Expand reduction intrinsics (expand-reductions) ***
define i16 @reduce_smin_i16(<8 x i16> %in) #0 {
  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %in)
  ret i16 %res
}
*** IR Dump After Expand reduction intrinsics (expand-reductions) ***
define i16 @reduce_smin_i16(<8 x i16> %in) #0 {
  %rdx.shuf = shufflevector <8 x i16> %in, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %in, <8 x i16> %rdx.shuf)
  %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax2 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %rdx.minmax, <8 x i16> %rdx.shuf1)
  %rdx.shuf3 = shufflevector <8 x i16> %rdx.minmax2, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %rdx.minmax4 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %rdx.minmax2, <8 x i16> %rdx.shuf3)
  %1 = extractelement <8 x i16> %rdx.minmax4, i32 0
  ret i16 %1
}
	FastISel is disabled

=== reduce_smin_i16

Initial selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 24 nodes:
  t0: ch,glue = EntryToken
  t4: v8i16,ch = load<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64
  t5: v2i16 = extract_subvector t4, Constant:i64<0>
  t7: v2i16 = extract_subvector t4, Constant:i64<2>
  t9: v2i16 = extract_subvector t4, Constant:i64<4>
  t11: v2i16 = extract_subvector t4, Constant:i64<6>
    t13: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t4, poison:v8i16
  t14: v8i16 = smin t4, t13
    t15: v8i16 = vector_shuffle<2,3,u,u,u,u,u,u> t14, poison:v8i16
  t16: v8i16 = smin t14, t15
            t17: v8i16 = vector_shuffle<1,u,u,u,u,u,u,u> t16, poison:v8i16
          t18: v8i16 = smin t16, t17
        t20: i16 = extract_vector_elt t18, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

Optimized lowered selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 17 nodes:
  t0: ch,glue = EntryToken
  t4: v8i16,ch = load<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64
    t13: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t4, poison:v8i16
  t14: v8i16 = smin t4, t13
    t15: v8i16 = vector_shuffle<2,3,u,u,u,u,u,u> t14, poison:v8i16
  t16: v8i16 = smin t14, t15
            t17: v8i16 = vector_shuffle<1,u,u,u,u,u,u,u> t16, poison:v8i16
          t18: v8i16 = smin t16, t17
        t20: i16 = extract_vector_elt t18, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

Type-legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 16 nodes:
  t0: ch,glue = EntryToken
  t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
    t56: v2i16 = smin t26, t26:2
    t57: v2i16 = smin t26:1, t26:3
  t58: v2i16 = smin t56, t57
            t60: v2i16 = vector_shuffle<1,u> t58, undef:v2i16
          t61: v2i16 = smin t58, t60
        t20: i16 = extract_vector_elt t61, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

Optimized type-legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 16 nodes:
  t0: ch,glue = EntryToken
  t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
    t56: v2i16 = smin t26, t26:2
    t57: v2i16 = smin t26:1, t26:3
  t58: v2i16 = smin t56, t57
            t60: v2i16 = vector_shuffle<1,u> t58, undef:v2i16
          t61: v2i16 = smin t58, t60
        t20: i16 = extract_vector_elt t61, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

Legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
  t0: ch,glue = EntryToken
  t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
    t56: v2i16 = smin t26, t26:2
    t57: v2i16 = smin t26:1, t26:3
  t58: v2i16 = smin t56, t57
              t64: i16 = extract_vector_elt t58, Constant:i64<1>
            t66: v2i16 = BUILD_VECTOR t64, undef:i16
          t61: v2i16 = smin t58, t66
        t20: i16 = extract_vector_elt t61, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

Optimized legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
  t0: ch,glue = EntryToken
  t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
    t56: v2i16 = smin t26, t26:2
    t57: v2i16 = smin t26:1, t26:3
  t58: v2i16 = smin t56, t57
              t64: i16 = extract_vector_elt t58, Constant:i64<1>
            t66: v2i16 = BUILD_VECTOR t64, undef:i16
          t61: v2i16 = smin t58, t66
        t20: i16 = extract_vector_elt t61, Constant:i64<0>
      t21: i32 = zero_extend t20
    t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
  t23: ch = NVPTXISD::RET_GLUE t22

===== Instruction selection begins: %bb.0 ''

ISEL: Starting selection on root node: t23: ch = NVPTXISD::RET_GLUE t22
ISEL: Starting pattern match
  Morphed node: t23: ch = Return t22
ISEL: Match complete!

ISEL: Starting selection on root node: t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21

ISEL: Starting selection on root node: t21: i32 = zero_extend t20
ISEL: Starting pattern match
  Initial Opcode index to 93850
  TypeSwitch[i32] from 93851 to 93868
  Skipped scope entry (due to false predicate) at index 93870, continuing at 93884
  Morphed node: t21: i32 = CVT_u32_u16 t20, TargetConstant:i32<0>
ISEL: Match complete!

ISEL: Starting selection on root node: t20: i16 = extract_vector_elt t61, Constant:i64<0>
ISEL: Starting pattern match
  Initial Opcode index to 90513
  TypeSwitch[i16] from 90518 to 90521
  Morphed node: t20: i16 = I32toI16L_Sink t61
ISEL: Match complete!

ISEL: Starting selection on root node: t61: v2i16 = smin t58, t66
ISEL: Starting pattern match
  Initial Opcode index to 91831
  Match failed at index 91836
  Continuing at 91872
  Match failed at index 91873
  Continuing at 91882
  Match failed at index 91883
  Continuing at 91891
  Match failed at index 91892
  Continuing at 91900
  Morphed node: t61: v2i16 = SMIN16x2 t58, t66
ISEL: Match complete!

ISEL: Starting selection on root node: t66: v2i16 = BUILD_VECTOR t64, undef:i16
ISEL: Starting pattern match
  Initial Opcode index to 99457
  Morphed node: t66: v2i16 = V2I16toI32 t64, undef:i16
ISEL: Match complete!

ISEL: Starting selection on root node: t64: i16 = extract_vector_elt t58, Constant:i64<1>
ISEL: Starting pattern match
  Initial Opcode index to 90513
  Skipped scope entry (due to false predicate) at index 90516, continuing at 90580
  TypeSwitch[i16] from 90583 to 90586
  Morphed node: t64: i16 = I32toI16H_Sink t58
ISEL: Match complete!

ISEL: Starting selection on root node: t58: v2i16 = smin t56, t57
ISEL: Starting pattern match
  Initial Opcode index to 91831
  Match failed at index 91836
  Continuing at 91872
  Match failed at index 91873
  Continuing at 91882
  Match failed at index 91883
  Continuing at 91891
  Match failed at index 91892
  Continuing at 91900
  Morphed node: t58: v2i16 = SMIN16x2 t56, t57
ISEL: Match complete!

ISEL: Starting selection on root node: t56: v2i16 = smin t26, t26:2
ISEL: Starting pattern match
  Initial Opcode index to 91831
  Match failed at index 91836
  Continuing at 91872
  Match failed at index 91873
  Continuing at 91882
  Match failed at index 91883
  Continuing at 91891
  Match failed at index 91892
  Continuing at 91900
  Morphed node: t56: v2i16 = SMIN16x2 t26, t26:2
ISEL: Match complete!

ISEL: Starting selection on root node: t57: v2i16 = smin t26:1, t26:3
ISEL: Starting pattern match
  Initial Opcode index to 91831
  Match failed at index 91836
  Continuing at 91872
  Match failed at index 91873
  Continuing at 91882
  Match failed at index 91883
  Continuing at 91891
  Match failed at index 91892
  Continuing at 91900
  Morphed node: t57: v2i16 = SMIN16x2 t26:1, t26:3
ISEL: Match complete!

ISEL: Starting selection on root node: t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>

ISEL: Starting selection on root node: t65: i16 = undef

ISEL: Starting selection on root node: t1: i64 = TargetExternalSymbol'reduce_smin_i16_param_0'

ISEL: Starting selection on root node: t0: ch,glue = EntryToken

===== Instruction selection ends:

Selected selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
  t0: ch,glue = EntryToken
    t56: v2i16 = SMIN16x2 t72, t72:2
    t57: v2i16 = SMIN16x2 t72:1, t72:3
  t58: v2i16 = SMIN16x2 t56, t57
  t72: v2i16,v2i16,v2i16,v2i16,ch = LDV_i32_v4<Mem:(dereferenceable invariant load (s128), addrspace 101)> TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<101>, TargetConstant:i32<3>, TargetConstant:i32<32>, TargetExternalSymbol:i64'reduce_smin_i16_param_0', TargetConstant:i32<0>, t0
              t64: i16 = I32toI16H_Sink t58
            t66: v2i16 = V2I16toI32 t64, IMPLICIT_DEF:i16
          t61: v2i16 = SMIN16x2 t58, t66
        t20: i16 = I32toI16L_Sink t61
      t21: i32 = CVT_u32_u16 t20, TargetConstant:i32<0>
    t68: ch = StoreRetvalI32<Mem:(store (s32), align 2)> t21, TargetConstant:i32<0>, t0
  t23: ch = Return t68

Total amount of phi nodes to update: 0
*** MachineFunction at end of ISel ***
# Machine code for function reduce_smin_i16: IsSSA, TracksLiveness

bb.0 (%ir-block.0):
  %0:int32regs, %1:int32regs, %2:int32regs, %3:int32regs = LDV_i32_v4 0, 0, 101, 3, 32, &reduce_smin_i16_param_0, 0 :: (dereferenceable invariant load (s128), addrspace 101)
  %4:int32regs = SMIN16x2 killed %1:int32regs, killed %3:int32regs
  %5:int32regs = SMIN16x2 killed %0:int32regs, killed %2:int32regs
  %6:int32regs = SMIN16x2 killed %5:int32regs, killed %4:int32regs
  %7:int16regs = I32toI16H_Sink %6:int32regs
  %9:int16regs = IMPLICIT_DEF
  %8:int32regs = V2I16toI32 killed %7:int16regs, killed %9:int16regs
  %10:int32regs = SMIN16x2 %6:int32regs, killed %8:int32regs
  %11:int16regs = I32toI16L_Sink killed %10:int32regs
  %12:int32regs = CVT_u32_u16 killed %11:int16regs, 0
  StoreRetvalI32 killed %12:int32regs, 0 :: (store (s32), align 2)
  Return

# End machine code for function reduce_smin_i16.

//
// Generated by LLVM NVPTX Back-End
//

.version 8.6
.target sm_100
.address_size 64

	// .globl	reduce_smin_i16         // -- Begin function reduce_smin_i16
                                        // @reduce_smin_i16
.visible .func  (.param .b32 func_retval0) reduce_smin_i16(
	.param .align 16 .b8 reduce_smin_i16_param_0[16]
)
{
	.reg .b16 	%rs<4>;
	.reg .b32 	%r<11>;

// %bb.0:
	ld.param.v4.b32 	{%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
	min.s16x2 	%r5, %r2, %r4;
	min.s16x2 	%r6, %r1, %r3;
	min.s16x2 	%r7, %r6, %r5;
	mov.b32 	{_, %rs1}, %r7;
	mov.b32 	%r8, {%rs1, %rs2};
	min.s16x2 	%r9, %r7, %r8;
	mov.b32 	{%rs3, _}, %r9;
	cvt.u32.u16 	%r10, %rs3;
	st.param.b32 	[func_retval0], %r10;
	ret;
                                        // -- End function
}
```
</details>

https://github.com/llvm/llvm-project/pull/143943