[llvm] [VPlan] Avoid getting flags from UV in partial-red (PR #190214)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 10:11:25 PDT 2026
https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/190214
We should avoid consulting the underlying instruction of a VPWidenRecipe to get the wrap flags: the correct flags are already available on the recipe itself, via its VPIRFlags inheritance. The patch likely fixes a latent miscompile.
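For context, here is a minimal, self-contained sketch of the pattern the fix relies on. The types below are simplified stand-ins for the real VPlan classes, not LLVM's actual declarations; they only illustrate why passing the recipe where flags are expected is enough:

#include <cassert>

// Simplified stand-in: in VPlan, wrap-flag state (nuw/nsw) lives here.
struct VPIRFlags {
  bool HasNSW = false;
  bool HasNUW = false;
};

// Simplified stand-in: a widen recipe *is a* VPIRFlags, so the recipe
// itself can be passed wherever a VPIRFlags is expected.
struct VPWidenRecipe : VPIRFlags {
  // The underlying IR instruction can be null for recipes synthesized
  // by earlier transforms, which is why querying it for flags is fragile.
  void *UnderlyingInstr = nullptr;
};

int main() {
  VPWidenRecipe Widen;
  Widen.HasNSW = true; // the recipe carries its own flags

  // Passing the recipe where VPIRFlags is expected slices in the
  // correct flags directly, with no detour through UnderlyingInstr.
  VPIRFlags Flags = Widen;
  assert(Flags.HasNSW);
}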
From 9ea0a14a4f69edc3db1c49ab4d4e43fdc2e63d66 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Thu, 2 Apr 2026 18:02:31 +0100
Subject: [PATCH] [VPlan] Avoid getting flags from UV in partial-red
We should avoid consulting the underlying instruction of a
VPWidenRecipe to get the wrap flags: the correct flags are already
available on the recipe itself, via its VPIRFlags inheritance. The
patch likely fixes a latent miscompile.
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 8 +++-----
.../LoopVectorize/AArch64/partial-reduce-chained.ll | 12 ++++++------
2 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9944df4778774..2a5f85f7e9a3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5995,11 +5995,9 @@ static void transformToPartialReduction(const VPPartialReductionChain &Chain,
VPBuilder Builder(WidenRecipe);
Type *ElemTy = TypeInfo.inferScalarType(BinOp);
auto *Zero = Plan.getZero(ElemTy);
- VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
- ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
- : VPIRFlags();
- auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
- VPIRMetadata(), DebugLoc::getUnknown());
+ auto *NegRecipe =
+ new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, *WidenRecipe,
+ VPIRMetadata(), DebugLoc::getUnknown());
Builder.insert(NegRecipe);
BinOp = NegRecipe;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 15e0220b71d61..6e8d27989ab5d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -338,7 +338,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
@@ -380,7 +380,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP11]]
@@ -422,7 +422,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
@@ -813,7 +813,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
@@ -858,7 +858,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 16 x i32> [[TMP15]], [[TMP17]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP18]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP18]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP15]], [[TMP11]]
@@ -903,7 +903,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]