[llvm] Handle VECREDUCE intrinsics in NVPTX backend (PR #136253)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 01:11:49 PDT 2025
================
@@ -320,33 +272,38 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7;
-; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5;
-; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8;
-; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6;
-; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4;
-; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12;
+; CHECK-NEXT: mul.rn.f32 %r8, %r5, %r6;
+; CHECK-NEXT: mul.rn.f32 %r9, %r8, %r7;
+; CHECK-NEXT: mul.rn.f32 %r10, %r3, %r4;
+; CHECK-NEXT: mul.rn.f32 %r11, %r1, %r2;
+; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r10;
+; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
ret float %res
}
-; Check straight line reduction.
define half @reduce_fmax_half(<8 x half> %in) {
; CHECK-LABEL: reduce_fmax_half(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
-; CHECK-NEXT: max.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: max.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: max.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT: max.f16 %rs9, %rs7, %rs8;
+; CHECK-NEXT: max.f16 %rs10, %rs9, %rs5;
+; CHECK-NEXT: max.f16 %rs11, %rs10, %rs6;
+; CHECK-NEXT: max.f16 %rs12, %rs11, %rs3;
+; CHECK-NEXT: max.f16 %rs13, %rs12, %rs4;
+; CHECK-NEXT: max.f16 %rs14, %rs13, %rs1;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
----------------
Prince781 wrote:
The non-associativity of `fmax` (`maxNum`) forces a sequential reduction here.
https://github.com/llvm/llvm-project/pull/136253
More information about the llvm-commits mailing list