[llvm] 41c1a7b - [LV] Don't add fixed-order recurrence phis to forced scalars.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 13:58:24 PDT 2025
Author: Florian Hahn
Date: 2025-04-16T22:58:10+02:00
New Revision: 41c1a7be3f1a2556e407e761acb766a5d103d691
URL: https://github.com/llvm/llvm-project/commit/41c1a7be3f1a2556e407e761acb766a5d103d691
DIFF: https://github.com/llvm/llvm-project/commit/41c1a7be3f1a2556e407e761acb766a5d103d691.diff
LOG: [LV] Don't add fixed-order recurrence phis to forced scalars.
Fixed-order recurrence phis cannot be forced to be scalar; they will
always be widened at the moment.
Make sure we don't add them to ForcedScalars; otherwise the legacy cost
model will compute incorrect costs.
This fixes an assertion reported with
https://github.com/llvm/llvm-project/pull/129645.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd7f05465a50b..e9e1d0ac6c196 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6464,10 +6464,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
}
}
- } else
+ } else {
+ // Cannot scalarize fixed-order recurrence phis at the moment.
+ if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
+ continue;
+
// Make sure I gets scalarized and a cost estimate without
// scalarization overhead.
ForcedScalars[VF].insert(I);
+ }
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index bc57aeb775fb1..838df4d0caf09 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -91,7 +91,7 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
-define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 {
+define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) {
; CHECK-LABEL: @thirdorderrec(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 3
@@ -352,3 +352,101 @@ loop:
exit:
ret void
}
+
+define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %n) #0 {
+; CHECK-LABEL: @test_for_tried_to_force_scalar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[CONFLICT_RDX20:%.*]] = icmp ule i64 [[TMP0]], 8
+; CHECK-NEXT: br i1 [[CONFLICT_RDX20]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x ptr> poison, ptr [[A:%.*]], i32 3
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP28]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
+; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
+; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
+; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
+; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
+; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
+; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[PREV:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[NEXT]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[NEXT]], align 4
+; CHECK-NEXT: store float [[TMP40]], ptr [[C]], align 4
+; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[PREV]], align 4
+; CHECK-NEXT: store float [[TMP41]], ptr [[B]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %prev = phi ptr [ %A, %entry ], [ %next, %loop ]
+ %next = getelementptr nusw [3 x float], ptr %A, i64 %iv
+ %0 = load float, ptr %next, align 4
+ store float %0, ptr %C, align 4
+ %1 = load float, ptr %prev, align 4
+ store float %1, ptr %B, align 4
+ %iv.next = add nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv, %n
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="znver3" }
More information about the llvm-commits
mailing list