[llvm] 577c7dd - [AArch64] Add a phase-ordering test for vectorizing predicated selects. NFC

Fri Oct 25 07:20:29 PDT 2024

Author: David Green
Date: 2024-10-25T15:20:24+01:00
New Revision: 577c7dd7cc4c5a9f62f9654cfa30ee9d55709426

URL: https://github.com/llvm/llvm-project/commit/577c7dd7cc4c5a9f62f9654cfa30ee9d55709426
DIFF: https://github.com/llvm/llvm-project/commit/577c7dd7cc4c5a9f62f9654cfa30ee9d55709426.diff

LOG: [AArch64] Add a phase-ordering test for vectorizing predicated selects. NFC

Added: 
    llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll
new file mode 100644
index 00000000000000..7274e952567693

--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64" ; AArch64 target, matching the PhaseOrdering/AArch64 directory this test is added to
+
+define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef %RAND_BLOCK_LENGTH, ptr noundef %samples, double noundef nofpclass(nan inf) %Y, double noundef nofpclass(nan inf) %Z) { ; single loop over samples[0 .. RAND_BLOCK_LENGTH): for each callValue = Y * samples[i] - Z that is > 0, adds callValue to v0 and callValue^2 to v1; returns v0 + v1. %nblocks is stored but never read here.
+; CHECK-LABEL: define nofpclass(nan inf) double @monte_simple(
+; CHECK-SAME: i32 noundef [[NBLOCKS:%.*]], i32 noundef [[RAND_BLOCK_LENGTH:%.*]], ptr nocapture noundef readonly [[SAMPLES:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[RAND_BLOCK_LENGTH]], 0
+; CHECK-NEXT:    br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V1_011:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V1_1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V0_010:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V0_1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp fast ogt double [[SUB]], 0.000000e+00
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[SUB]], [[V0_010]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double [[SUB]], [[SUB]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast double [[MUL3]], [[V1_011]]
+; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], double [[ADD]], double [[V0_010]]
+; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP1]], double [[ADD4]], double [[V1_011]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]]
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret double [[ADD5]]
+;
+entry: ; the expected output above is a scalar loop in which both accumulators become selects on one shared compare
+  %nblocks.addr = alloca i32, align 4 ; every argument and local below gets its own stack slot
+  %RAND_BLOCK_LENGTH.addr = alloca i32, align 4
+  %samples.addr = alloca ptr, align 8
+  %Y.addr = alloca double, align 8
+  %Z.addr = alloca double, align 8
+  %i = alloca i32, align 4
+  %block = alloca i32, align 4 ; only touched by the lifetime markers in this function
+  %rngVal = alloca double, align 8
+  %callValue = alloca double, align 8
+  %v0 = alloca double, align 8
+  %v1 = alloca double, align 8
+  store i32 %nblocks, ptr %nblocks.addr, align 4 ; copy the incoming arguments into their slots
+  store i32 %RAND_BLOCK_LENGTH, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  store ptr %samples, ptr %samples.addr, align 8
+  store double %Y, ptr %Y.addr, align 8
+  store double %Z, ptr %Z.addr, align 8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %block) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %rngVal) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %callValue) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v0) #2
+  store double 0.000000e+00, ptr %v0, align 8 ; v0 = 0.0 (accumulates callValue)
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v1) #2
+  store double 0.000000e+00, ptr %v1, align 8 ; v1 = 0.0 (accumulates callValue squared)
+  store i32 0, ptr %i, align 4 ; i = 0
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %cmp = icmp slt i32 %0, %1 ; continue while i < RAND_BLOCK_LENGTH (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %samples.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %3 to i64 ; sign-extend i to a 64-bit index
+  %arrayidx = getelementptr inbounds float, ptr %2, i64 %idxprom
+  %4 = load float, ptr %arrayidx, align 4 ; samples[i]
+  %conv = fpext float %4 to double
+  store double %conv, ptr %rngVal, align 8 ; rngVal = (double)samples[i]
+  %5 = load double, ptr %Y.addr, align 8
+  %6 = load double, ptr %rngVal, align 8
+  %mul = fmul fast double %5, %6 ; Y * rngVal
+  %7 = load double, ptr %Z.addr, align 8
+  %sub = fsub fast double %mul, %7
+  store double %sub, ptr %callValue, align 8 ; callValue = Y * rngVal - Z
+  %8 = load double, ptr %callValue, align 8
+  %cmp1 = fcmp fast ogt double %8, 0.000000e+00 ; predicate: callValue > 0.0
+  br i1 %cmp1, label %if.then, label %if.end ; both accumulator updates are skipped when the predicate is false
+
+if.then:                                          ; preds = %for.body
+  %9 = load double, ptr %callValue, align 8
+  %10 = load double, ptr %v0, align 8
+  %add = fadd fast double %10, %9
+  store double %add, ptr %v0, align 8 ; v0 += callValue
+  %11 = load double, ptr %callValue, align 8
+  %12 = load double, ptr %callValue, align 8
+  %mul3 = fmul fast double %11, %12 ; callValue * callValue
+  %13 = load double, ptr %v1, align 8
+  %add4 = fadd fast double %13, %mul3
+  store double %add4, ptr %v1, align 8 ; v1 += callValue * callValue
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %14 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %14, 1
+  store i32 %inc, ptr %i, align 4 ; ++i
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %15 = load double, ptr %v0, align 8
+  %16 = load double, ptr %v1, align 8
+  %add5 = fadd fast double %15, %16 ; return value = v0 + v1
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v1) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v0) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %callValue) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %rngVal) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %block) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2
+  ret double %add5
+}
+
+define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %RAND_BLOCK_LENGTH, ptr noundef %samples, double noundef nofpclass(nan inf) %Y, double noundef nofpclass(nan inf) %Z) { ; nested loops: for each of nblocks blocks, calls @resample and then walks samples[0 .. RAND_BLOCK_LENGTH); for each callValue = Y * exp2(samples[i]) - Z that is > 0, adds callValue to v0 and callValue^2 to v1; returns v0 + v1.
+; CHECK-LABEL: define nofpclass(nan inf) double @monte_exp(
+; CHECK-SAME: i32 noundef [[NBLOCKS:%.*]], i32 noundef [[RAND_BLOCK_LENGTH:%.*]], ptr noundef [[SAMPLES:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[NBLOCKS]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END10:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP211:%.*]] = icmp sgt i32 [[RAND_BLOCK_LENGTH]], 0
+; CHECK-NEXT:    br i1 [[CMP211]], label %[[FOR_BODY_US_PREHEADER:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY_US:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[V1_019_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    [[V0_018_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]])
+; CHECK-NEXT:    br label %[[FOR_BODY3_US:.*]]
+; CHECK:       [[FOR_BODY3_US]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[V1_114_US:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[V0_113_US:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[V0_2_US]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[CONV_US:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]])
+; CHECK-NEXT:    [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]]
+; CHECK-NEXT:    [[SUB_US:%.*]] = fsub fast double [[MUL_US]], [[Z]]
+; CHECK-NEXT:    [[CMP4_US:%.*]] = fcmp fast ogt double [[SUB_US]], 0.000000e+00
+; CHECK-NEXT:    [[ADD_US:%.*]] = fadd fast double [[SUB_US]], [[V0_113_US]]
+; CHECK-NEXT:    [[MUL6_US:%.*]] = fmul fast double [[SUB_US]], [[SUB_US]]
+; CHECK-NEXT:    [[ADD7_US:%.*]] = fadd fast double [[MUL6_US]], [[V1_114_US]]
+; CHECK-NEXT:    [[V0_2_US]] = select i1 [[CMP4_US]], double [[ADD_US]], double [[V0_113_US]]
+; CHECK-NEXT:    [[V1_2_US]] = select i1 [[CMP4_US]], double [[ADD7_US]], double [[V1_114_US]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]]
+; CHECK:       [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]:
+; CHECK-NEXT:    [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1
+; CHECK-NEXT:    [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]]
+; CHECK-NEXT:    br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]])
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END10]]:
+; CHECK-NEXT:    [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]]
+; CHECK-NEXT:    ret double [[ADD11]]
+;
+entry: ; the expected output above keeps a scalar inner loop with two selects, plus a separate outer-loop copy that only calls @resample
+  %nblocks.addr = alloca i32, align 4 ; every argument and local below gets its own stack slot
+  %RAND_BLOCK_LENGTH.addr = alloca i32, align 4
+  %samples.addr = alloca ptr, align 8
+  %Y.addr = alloca double, align 8
+  %Z.addr = alloca double, align 8
+  %i = alloca i32, align 4
+  %block = alloca i32, align 4
+  %rngVal = alloca double, align 8
+  %callValue = alloca double, align 8
+  %v0 = alloca double, align 8
+  %v1 = alloca double, align 8
+  store i32 %nblocks, ptr %nblocks.addr, align 4 ; copy the incoming arguments into their slots
+  store i32 %RAND_BLOCK_LENGTH, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  store ptr %samples, ptr %samples.addr, align 8
+  store double %Y, ptr %Y.addr, align 8
+  store double %Z, ptr %Z.addr, align 8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
+  call void @llvm.lifetime.start.p0(i64 4, ptr %block) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %rngVal) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %callValue) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v0) #4
+  store double 0.000000e+00, ptr %v0, align 8 ; v0 = 0.0 (accumulates callValue across all blocks)
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v1) #4
+  store double 0.000000e+00, ptr %v1, align 8 ; v1 = 0.0 (accumulates callValue squared across all blocks)
+  store i32 0, ptr %block, align 4 ; block = 0
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc8, %entry
+  %0 = load i32, ptr %block, align 4
+  %1 = load i32, ptr %nblocks.addr, align 4
+  %cmp = icmp slt i32 %0, %1 ; outer loop continues while block < nblocks (signed compare)
+  br i1 %cmp, label %for.body, label %for.end10
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %3 = load ptr, ptr %samples.addr, align 8
+  call void @resample(i32 noundef %2, ptr noundef %3) ; resample(RAND_BLOCK_LENGTH, samples) runs once per block, before the inner loop
+  store i32 0, ptr %i, align 4 ; i = 0
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %4 = load i32, ptr %i, align 4
+  %5 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %cmp2 = icmp slt i32 %4, %5 ; inner loop continues while i < RAND_BLOCK_LENGTH (signed compare)
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %6 = load ptr, ptr %samples.addr, align 8
+  %7 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %7 to i64 ; sign-extend i to a 64-bit index
+  %arrayidx = getelementptr inbounds float, ptr %6, i64 %idxprom
+  %8 = load float, ptr %arrayidx, align 4 ; samples[i]
+  %conv = fpext float %8 to double
+  store double %conv, ptr %rngVal, align 8 ; rngVal = (double)samples[i]
+  %9 = load double, ptr %Y.addr, align 8
+  %10 = load double, ptr %rngVal, align 8
+  %11 = call fast double @llvm.exp2.f64(double %10) ; exp2(rngVal)
+  %mul = fmul fast double %9, %11 ; Y * exp2(rngVal)
+  %12 = load double, ptr %Z.addr, align 8
+  %sub = fsub fast double %mul, %12
+  store double %sub, ptr %callValue, align 8 ; callValue = Y * exp2(rngVal) - Z
+  %13 = load double, ptr %callValue, align 8
+  %cmp4 = fcmp fast ogt double %13, 0.000000e+00 ; predicate: callValue > 0.0
+  br i1 %cmp4, label %if.then, label %if.end ; both accumulator updates are skipped when the predicate is false
+
+if.then:                                          ; preds = %for.body3
+  %14 = load double, ptr %callValue, align 8
+  %15 = load double, ptr %v0, align 8
+  %add = fadd fast double %15, %14
+  store double %add, ptr %v0, align 8 ; v0 += callValue
+  %16 = load double, ptr %callValue, align 8
+  %17 = load double, ptr %callValue, align 8
+  %mul6 = fmul fast double %16, %17 ; callValue * callValue
+  %18 = load double, ptr %v1, align 8
+  %add7 = fadd fast double %18, %mul6
+  store double %add7, ptr %v1, align 8 ; v1 += callValue * callValue
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body3
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %19 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %19, 1
+  store i32 %inc, ptr %i, align 4 ; ++i
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %for.end
+  %20 = load i32, ptr %block, align 4
+  %inc9 = add nsw i32 %20, 1
+  store i32 %inc9, ptr %block, align 4 ; ++block
+  br label %for.cond
+
+for.end10:                                        ; preds = %for.cond
+  %21 = load double, ptr %v0, align 8
+  %22 = load double, ptr %v1, align 8
+  %add11 = fadd fast double %21, %22 ; return value = v0 + v1
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v1) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v0) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %callValue) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %rngVal) #4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %block) #4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
+  ret double %add11
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) ; called on the local stack slots in both entry blocks
+declare void @resample(i32 noundef, ptr noundef) ; external, no body in this file; called once per outer-loop iteration of @monte_exp
+declare double @llvm.exp2.f64(double) ; called in the inner loop of @monte_exp
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) ; called on the same slots just before each function returns