[llvm] [NVPTX] add combiner rule for final packed op in reduction (PR #143943)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 00:54:42 PDT 2025
Prince781 wrote:
The default expansion of reduction intrinsics happens in `ExpandReductions`, which is an IR-level pass. The pass generates a shuffle reduction sequence, which uses `shufflevector`s to iteratively fold the vector. This sequence is not inherently problematic; it only becomes an issue when the target supports vector types in a single register. I will investigate whether we can do the fixup in DAGCombiner when the final shuffle is legal, as @AlexMaclean suggests.
<details>
<summary>
Here's a trace of the old behavior on this IR:
```llvm
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i16 @reduce_smin_i16(<8 x i16> %in) {
%res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
ret i16 %res
}
```
</summary>
```
% llc < reduction-intrinsics.ll -debug-only=isel -mcpu=sm_100 -print-before=expand-reductions -print-after=expand-reductions
*** IR Dump Before Expand reduction intrinsics (expand-reductions) ***
define i16 @reduce_smin_i16(<8 x i16> %in) #0 {
%res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %in)
ret i16 %res
}
*** IR Dump After Expand reduction intrinsics (expand-reductions) ***
define i16 @reduce_smin_i16(<8 x i16> %in) #0 {
%rdx.shuf = shufflevector <8 x i16> %in, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
%rdx.minmax = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %in, <8 x i16> %rdx.shuf)
%rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%rdx.minmax2 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %rdx.minmax, <8 x i16> %rdx.shuf1)
%rdx.shuf3 = shufflevector <8 x i16> %rdx.minmax2, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%rdx.minmax4 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %rdx.minmax2, <8 x i16> %rdx.shuf3)
%1 = extractelement <8 x i16> %rdx.minmax4, i32 0
ret i16 %1
}
FastISel is disabled
=== reduce_smin_i16
Initial selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 24 nodes:
t0: ch,glue = EntryToken
t4: v8i16,ch = load<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64
t5: v2i16 = extract_subvector t4, Constant:i64<0>
t7: v2i16 = extract_subvector t4, Constant:i64<2>
t9: v2i16 = extract_subvector t4, Constant:i64<4>
t11: v2i16 = extract_subvector t4, Constant:i64<6>
t13: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t4, poison:v8i16
t14: v8i16 = smin t4, t13
t15: v8i16 = vector_shuffle<2,3,u,u,u,u,u,u> t14, poison:v8i16
t16: v8i16 = smin t14, t15
t17: v8i16 = vector_shuffle<1,u,u,u,u,u,u,u> t16, poison:v8i16
t18: v8i16 = smin t16, t17
t20: i16 = extract_vector_elt t18, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
Optimized lowered selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 17 nodes:
t0: ch,glue = EntryToken
t4: v8i16,ch = load<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64
t13: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t4, poison:v8i16
t14: v8i16 = smin t4, t13
t15: v8i16 = vector_shuffle<2,3,u,u,u,u,u,u> t14, poison:v8i16
t16: v8i16 = smin t14, t15
t17: v8i16 = vector_shuffle<1,u,u,u,u,u,u,u> t16, poison:v8i16
t18: v8i16 = smin t16, t17
t20: i16 = extract_vector_elt t18, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
Type-legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 16 nodes:
t0: ch,glue = EntryToken
t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
t56: v2i16 = smin t26, t26:2
t57: v2i16 = smin t26:1, t26:3
t58: v2i16 = smin t56, t57
t60: v2i16 = vector_shuffle<1,u> t58, undef:v2i16
t61: v2i16 = smin t58, t60
t20: i16 = extract_vector_elt t61, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
Optimized type-legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 16 nodes:
t0: ch,glue = EntryToken
t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
t56: v2i16 = smin t26, t26:2
t57: v2i16 = smin t26:1, t26:3
t58: v2i16 = smin t56, t57
t60: v2i16 = vector_shuffle<1,u> t58, undef:v2i16
t61: v2i16 = smin t58, t60
t20: i16 = extract_vector_elt t61, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
Legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
t0: ch,glue = EntryToken
t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
t56: v2i16 = smin t26, t26:2
t57: v2i16 = smin t26:1, t26:3
t58: v2i16 = smin t56, t57
t64: i16 = extract_vector_elt t58, Constant:i64<1>
t66: v2i16 = BUILD_VECTOR t64, undef:i16
t61: v2i16 = smin t58, t66
t20: i16 = extract_vector_elt t61, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
Optimized legalized selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
t0: ch,glue = EntryToken
t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
t56: v2i16 = smin t26, t26:2
t57: v2i16 = smin t26:1, t26:3
t58: v2i16 = smin t56, t57
t64: i16 = extract_vector_elt t58, Constant:i64<1>
t66: v2i16 = BUILD_VECTOR t64, undef:i16
t61: v2i16 = smin t58, t66
t20: i16 = extract_vector_elt t61, Constant:i64<0>
t21: i32 = zero_extend t20
t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
t23: ch = NVPTXISD::RET_GLUE t22
===== Instruction selection begins: %bb.0 ''
ISEL: Starting selection on root node: t23: ch = NVPTXISD::RET_GLUE t22
ISEL: Starting pattern match
Morphed node: t23: ch = Return t22
ISEL: Match complete!
ISEL: Starting selection on root node: t22: ch = NVPTXISD::StoreRetval<(store (s32), align 2)> t0, Constant:i32<0>, t21
ISEL: Starting selection on root node: t21: i32 = zero_extend t20
ISEL: Starting pattern match
Initial Opcode index to 93850
TypeSwitch[i32] from 93851 to 93868
Skipped scope entry (due to false predicate) at index 93870, continuing at 93884
Morphed node: t21: i32 = CVT_u32_u16 t20, TargetConstant:i32<0>
ISEL: Match complete!
ISEL: Starting selection on root node: t20: i16 = extract_vector_elt t61, Constant:i64<0>
ISEL: Starting pattern match
Initial Opcode index to 90513
TypeSwitch[i16] from 90518 to 90521
Morphed node: t20: i16 = I32toI16L_Sink t61
ISEL: Match complete!
ISEL: Starting selection on root node: t61: v2i16 = smin t58, t66
ISEL: Starting pattern match
Initial Opcode index to 91831
Match failed at index 91836
Continuing at 91872
Match failed at index 91873
Continuing at 91882
Match failed at index 91883
Continuing at 91891
Match failed at index 91892
Continuing at 91900
Morphed node: t61: v2i16 = SMIN16x2 t58, t66
ISEL: Match complete!
ISEL: Starting selection on root node: t66: v2i16 = BUILD_VECTOR t64, undef:i16
ISEL: Starting pattern match
Initial Opcode index to 99457
Morphed node: t66: v2i16 = V2I16toI32 t64, undef:i16
ISEL: Match complete!
ISEL: Starting selection on root node: t64: i16 = extract_vector_elt t58, Constant:i64<1>
ISEL: Starting pattern match
Initial Opcode index to 90513
Skipped scope entry (due to false predicate) at index 90516, continuing at 90580
TypeSwitch[i16] from 90583 to 90586
Morphed node: t64: i16 = I32toI16H_Sink t58
ISEL: Match complete!
ISEL: Starting selection on root node: t58: v2i16 = smin t56, t57
ISEL: Starting pattern match
Initial Opcode index to 91831
Match failed at index 91836
Continuing at 91872
Match failed at index 91873
Continuing at 91882
Match failed at index 91883
Continuing at 91891
Match failed at index 91892
Continuing at 91900
Morphed node: t58: v2i16 = SMIN16x2 t56, t57
ISEL: Match complete!
ISEL: Starting selection on root node: t56: v2i16 = smin t26, t26:2
ISEL: Starting pattern match
Initial Opcode index to 91831
Match failed at index 91836
Continuing at 91872
Match failed at index 91873
Continuing at 91882
Match failed at index 91883
Continuing at 91891
Match failed at index 91892
Continuing at 91900
Morphed node: t56: v2i16 = SMIN16x2 t26, t26:2
ISEL: Match complete!
ISEL: Starting selection on root node: t57: v2i16 = smin t26:1, t26:3
ISEL: Starting pattern match
Initial Opcode index to 91831
Match failed at index 91836
Continuing at 91872
Match failed at index 91873
Continuing at 91882
Match failed at index 91883
Continuing at 91891
Match failed at index 91892
Continuing at 91900
Morphed node: t57: v2i16 = SMIN16x2 t26:1, t26:3
ISEL: Match complete!
ISEL: Starting selection on root node: t26: v2i16,v2i16,v2i16,v2i16,ch = NVPTXISD::LoadV4<(dereferenceable invariant load (s128), addrspace 101)> t0, TargetExternalSymbol:i64'reduce_smin_i16_param_0', undef:i64, Constant:i64<0>
ISEL: Starting selection on root node: t65: i16 = undef
ISEL: Starting selection on root node: t1: i64 = TargetExternalSymbol'reduce_smin_i16_param_0'
ISEL: Starting selection on root node: t0: ch,glue = EntryToken
===== Instruction selection ends:
Selected selection DAG: %bb.0 'reduce_smin_i16:'
SelectionDAG has 18 nodes:
t0: ch,glue = EntryToken
t56: v2i16 = SMIN16x2 t72, t72:2
t57: v2i16 = SMIN16x2 t72:1, t72:3
t58: v2i16 = SMIN16x2 t56, t57
t72: v2i16,v2i16,v2i16,v2i16,ch = LDV_i32_v4<Mem:(dereferenceable invariant load (s128), addrspace 101)> TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<101>, TargetConstant:i32<3>, TargetConstant:i32<32>, TargetExternalSymbol:i64'reduce_smin_i16_param_0', TargetConstant:i32<0>, t0
t64: i16 = I32toI16H_Sink t58
t66: v2i16 = V2I16toI32 t64, IMPLICIT_DEF:i16
t61: v2i16 = SMIN16x2 t58, t66
t20: i16 = I32toI16L_Sink t61
t21: i32 = CVT_u32_u16 t20, TargetConstant:i32<0>
t68: ch = StoreRetvalI32<Mem:(store (s32), align 2)> t21, TargetConstant:i32<0>, t0
t23: ch = Return t68
Total amount of phi nodes to update: 0
*** MachineFunction at end of ISel ***
# Machine code for function reduce_smin_i16: IsSSA, TracksLiveness
bb.0 (%ir-block.0):
%0:int32regs, %1:int32regs, %2:int32regs, %3:int32regs = LDV_i32_v4 0, 0, 101, 3, 32, &reduce_smin_i16_param_0, 0 :: (dereferenceable invariant load (s128), addrspace 101)
%4:int32regs = SMIN16x2 killed %1:int32regs, killed %3:int32regs
%5:int32regs = SMIN16x2 killed %0:int32regs, killed %2:int32regs
%6:int32regs = SMIN16x2 killed %5:int32regs, killed %4:int32regs
%7:int16regs = I32toI16H_Sink %6:int32regs
%9:int16regs = IMPLICIT_DEF
%8:int32regs = V2I16toI32 killed %7:int16regs, killed %9:int16regs
%10:int32regs = SMIN16x2 %6:int32regs, killed %8:int32regs
%11:int16regs = I32toI16L_Sink killed %10:int32regs
%12:int32regs = CVT_u32_u16 killed %11:int16regs, 0
StoreRetvalI32 killed %12:int32regs, 0 :: (store (s32), align 2)
Return
# End machine code for function reduce_smin_i16.
//
// Generated by LLVM NVPTX Back-End
//
.version 8.6
.target sm_100
.address_size 64
// .globl reduce_smin_i16 // -- Begin function reduce_smin_i16
// @reduce_smin_i16
.visible .func (.param .b32 func_retval0) reduce_smin_i16(
.param .align 16 .b8 reduce_smin_i16_param_0[16]
)
{
.reg .b16 %rs<4>;
.reg .b32 %r<11>;
// %bb.0:
ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
min.s16x2 %r5, %r2, %r4;
min.s16x2 %r6, %r1, %r3;
min.s16x2 %r7, %r6, %r5;
mov.b32 {_, %rs1}, %r7;
mov.b32 %r8, {%rs1, %rs2};
min.s16x2 %r9, %r7, %r8;
mov.b32 {%rs3, _}, %r9;
cvt.u32.u16 %r10, %rs3;
st.param.b32 [func_retval0], %r10;
ret;
// -- End function
}
```
</details>
https://github.com/llvm/llvm-project/pull/143943
More information about the llvm-commits mailing list