[clang] 1ec469c - [OPENMP50]Codegen for scan directives in parallel for regions.

Alexey Bataev via cfe-commits cfe-commits at lists.llvm.org
Thu Jun 18 08:59:58 PDT 2020


Author: Alexey Bataev
Date: 2020-06-18T11:56:55-04:00
New Revision: 1ec469cf4c20a0f80ab6507e3838cfcc3e4f9595

URL: https://github.com/llvm/llvm-project/commit/1ec469cf4c20a0f80ab6507e3838cfcc3e4f9595
DIFF: https://github.com/llvm/llvm-project/commit/1ec469cf4c20a0f80ab6507e3838cfcc3e4f9595.diff

LOG: [OPENMP50]Codegen for scan directives in parallel for regions.

Summary:
Added codegen for scan directives in parallel for regions.

Emits the code for the directive with inscan reductions.
Original code:
```
 #pragma omp parallel for reduction(inscan, op : ...)
 for() {
   <input phase>;
   #pragma omp scan (in)exclusive(...)
   <scan phase>
 }
```
is transformed to something:

```
 #pragma omp parallel
{
size num_iters = <num_iters>;
<type> buffer[num_iters];
 #pragma omp for
for (i: 0..<num_iters>) {
  <input phase>;
  buffer[i] = red;
}
 #pragma omp barrier
for (int k = 0; k != ceil(log2(num_iters)); ++k)
for (size cnt = last_iter; cnt >= pow(2, k); --k)
  buffer[i] op= buffer[i-pow(2,k)];
 #pragma omp for
for (0..<num_iters>) {
  red = InclusiveScan ? buffer[i] : buffer[i-1];
  <scan phase>;
}
}
```

Reviewers: jdoerfert

Subscribers: yaxunl, guansong, sstefan1, cfe-commits, caomhin

Tags: #clang

Differential Revision: https://reviews.llvm.org/D81478

Added: 
    clang/test/OpenMP/parallel_for_scan_codegen.cpp

Modified: 
    clang/lib/CodeGen/CGStmtOpenMP.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e17bc184c93a..a91ca3d31a6b 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -3203,41 +3203,53 @@ static void emitScanBasedDirective(
   SecondGen(CGF);
 }
 
+static bool emitWorksharingDirective(CodeGenFunction &CGF,
+                                     const OMPLoopDirective &S,
+                                     bool HasCancel) {
+  bool HasLastprivates;
+  if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                   [](const OMPReductionClause *C) {
+                     return C->getModifier() == OMPC_REDUCTION_inscan;
+                   })) {
+    const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+      CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+      OMPLoopScope LoopScope(CGF, S);
+      return CGF.EmitScalarExpr(S.getNumIterations());
+    };
+    const auto &&FirstGen = [&S, HasCancel](CodeGenFunction &CGF) {
+      CodeGenFunction::OMPCancelStackRAII CancelRegion(
+          CGF, S.getDirectiveKind(), HasCancel);
+      (void)CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                       emitForLoopBounds,
+                                       emitDispatchForLoopBounds);
+      // Emit an implicit barrier at the end.
+      CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(),
+                                                 OMPD_for);
+    };
+    const auto &&SecondGen = [&S, HasCancel,
+                              &HasLastprivates](CodeGenFunction &CGF) {
+      CodeGenFunction::OMPCancelStackRAII CancelRegion(
+          CGF, S.getDirectiveKind(), HasCancel);
+      HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                                   emitForLoopBounds,
+                                                   emitDispatchForLoopBounds);
+    };
+    emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
+  } else {
+    CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, S.getDirectiveKind(),
+                                                     HasCancel);
+    HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
+                                                 emitForLoopBounds,
+                                                 emitDispatchForLoopBounds);
+  }
+  return HasLastprivates;
+}
+
 void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
   bool HasLastprivates = false;
   auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
                                           PrePostActionTy &) {
-    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
-                     [](const OMPReductionClause *C) {
-                       return C->getModifier() == OMPC_REDUCTION_inscan;
-                     })) {
-      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
-        OMPLocalDeclMapRAII Scope(CGF);
-        OMPLoopScope LoopScope(CGF, S);
-        return CGF.EmitScalarExpr(S.getNumIterations());
-      };
-      const auto &&FirstGen = [&S](CodeGenFunction &CGF) {
-        OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
-        (void)CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                         emitForLoopBounds,
-                                         emitDispatchForLoopBounds);
-        // Emit an implicit barrier at the end.
-        CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(),
-                                                   OMPD_for);
-      };
-      const auto &&SecondGen = [&S, &HasLastprivates](CodeGenFunction &CGF) {
-        OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
-        HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                                     emitForLoopBounds,
-                                                     emitDispatchForLoopBounds);
-      };
-      emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
-    } else {
-      OMPCancelStackRAII CancelRegion(CGF, OMPD_for, S.hasCancel());
-      HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                                   emitForLoopBounds,
-                                                   emitDispatchForLoopBounds);
-    }
+    HasLastprivates = emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
     auto LPCRegion =
@@ -3258,34 +3270,7 @@ void CodeGenFunction::EmitOMPForSimdDirective(const OMPForSimdDirective &S) {
   bool HasLastprivates = false;
   auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF,
                                           PrePostActionTy &) {
-    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
-                     [](const OMPReductionClause *C) {
-                       return C->getModifier() == OMPC_REDUCTION_inscan;
-                     })) {
-      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
-        OMPLocalDeclMapRAII Scope(CGF);
-        OMPLoopScope LoopScope(CGF, S);
-        return CGF.EmitScalarExpr(S.getNumIterations());
-      };
-      const auto &&FirstGen = [&S](CodeGenFunction &CGF) {
-        (void)CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                         emitForLoopBounds,
-                                         emitDispatchForLoopBounds);
-        // Emit an implicit barrier at the end.
-        CGF.CGM.getOpenMPRuntime().emitBarrierCall(CGF, S.getBeginLoc(),
-                                                   OMPD_for);
-      };
-      const auto &&SecondGen = [&S, &HasLastprivates](CodeGenFunction &CGF) {
-        HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                                     emitForLoopBounds,
-                                                     emitDispatchForLoopBounds);
-      };
-      emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
-    } else {
-      HasLastprivates = CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(),
-                                                   emitForLoopBounds,
-                                                   emitDispatchForLoopBounds);
-    }
+    HasLastprivates = emitWorksharingDirective(CGF, S, /*HasCancel=*/false);
   };
   {
     auto LPCRegion =
@@ -3621,9 +3606,7 @@ void CodeGenFunction::EmitOMPParallelForDirective(
   // directives: 'parallel' with 'for' directive.
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     Action.Enter(CGF);
-    OMPCancelStackRAII CancelRegion(CGF, OMPD_parallel_for, S.hasCancel());
-    CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
-                               emitDispatchForLoopBounds);
+    (void)emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
     auto LPCRegion =

diff  --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp
new file mode 100644
index 000000000000..f5687ac55697
--- /dev/null
+++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void foo();
+void bar();
+
+// CHECK: define void @{{.*}}baz{{.*}}(i32 %n)
+void baz(int n) {
+  static float a[10];
+  static double b;
+
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+
+  // double b_buffer[10];
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+#pragma omp parallel for reduction(inscan, +:a[:n], b)
+  for (int i = 0; i < 10; ++i) {
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+    // CHECK: [[INPUT_PHASE:.+]]:
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // a_buffer[i][0..n] = a_priv[[0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_buffer[i] = b_priv;
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
+    // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
+    // CHECK: br label %[[LOOP_CONTINUE:.+]]
+
+    // CHECK: [[DISPATCH]]:
+    // CHECK: br label %[[INPUT_PHASE]]
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @__kmpc_barrier(
+    foo();
+#pragma omp scan inclusive(a[:n], b)
+    // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
+    // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
+    // CHECK: [[CEIL_LOG2_10_INT:%.+]] = fptoui double [[CEIL_LOG2_10]] to i32
+    // CHECK: br label %[[OUTER_BODY:[^,]+]]
+    // CHECK: [[OUTER_BODY]]:
+    // CHECK: [[K:%.+]] = phi i32 [ 0, %{{.+}} ], [ [[K_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[K2POW:%.+]] = phi i64 [ 1, %{{.+}} ], [ [[K2POW_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[CMP:%.+]] = icmp uge i64 9, [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY:[^,]+]], label %[[INNER_EXIT:[^,]+]]
+    // CHECK: [[INNER_BODY]]:
+    // CHECK: [[I:%.+]] = phi i64 [ 9, %[[OUTER_BODY]] ], [ [[I_PREV:%.+]], %{{.+}} ]
+
+    // a_buffer[i] += a_buffer[i-pow(2, k)];
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[I]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[IDX_SUB_K2POW]]
+    // CHECK: [[A_BUF_END:%.+]] = getelementptr float, float* [[A_BUF_IDX]], i64 [[NUM_ELEMS]]
+    // CHECK: [[ISEMPTY:%.+]] = icmp eq float* [[A_BUF_IDX]], [[A_BUF_END]]
+    // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]]
+    // CHECK: [[RED_BODY]]:
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_ELEM:%.+]] = phi float* [ [[A_BUF_IDX_SUB_K2POW]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_SUB_K2POW_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_ELEM:%.+]] = phi float* [ [[A_BUF_IDX]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_VAL:%.+]] = load float, float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_VAL:%.+]] = load float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]],
+    // CHECK: [[RED:%.+]] = fadd float [[A_BUF_IDX_VAL]], [[A_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store float [[RED]], float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_NEXT]] = getelementptr float, float* [[A_BUF_IDX_ELEM]], i32 1
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_NEXT]] = getelementptr float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]], i32 1
+    // CHECK: [[DONE:%.+]] = icmp eq float* [[A_BUF_IDX_NEXT]], [[A_BUF_END]]
+    // CHECK: br i1 [[DONE]], label %[[RED_DONE]], label %[[RED_BODY]]
+    // CHECK: [[RED_DONE]]:
+
+    // b_buffer[i] += b_buffer[i-pow(2, k)];
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: [[B_BUF_IDX_SUB_K2POW_VAL:%.+]] = load double, double* [[B_BUF_IDX_SUB_K2POW]],
+    // CHECK: [[RED:%.+]] = fadd double [[B_BUF_IDX_VAL]], [[B_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store double [[RED]], double* [[B_BUF_IDX]],
+
+    // --i;
+    // CHECK: [[I_PREV:%.+]] = sub nuw i64 [[I]], 1
+    // CHECK: [[CMP:%.+]] = icmp uge i64 [[I_PREV]], [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY]], label %[[INNER_EXIT]]
+    // CHECK: [[INNER_EXIT]]:
+
+    // ++k;
+    // CHECK: [[K_NEXT]] = add nuw i32 [[K]], 1
+    // k2pow <<= 1;
+    // CHECK: [[K2POW_NEXT]] = shl nuw i64 [[K2POW]], 1
+    // CHECK: [[CMP:%.+]] = icmp ne i32 [[K_NEXT]], [[CEIL_LOG2_10_INT]]
+    // CHECK: br i1 [[CMP]], label %[[OUTER_BODY]], label %[[OUTER_EXIT:[^,]+]]
+    // CHECK: [[OUTER_EXIT]]:
+    bar();
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // Skip the before scan body.
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // CHECK: [[EXIT_INSCAN:[^,]+]]:
+    // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
+
+    // CHECK: [[DISPATCH]]:
+    // a_priv[[0..n] = a_buffer[i][0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_priv = b_buffer[i];
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: store double [[B_BUF_IDX_VAL]], double* [[B_PRIV_ADDR]],
+    // CHECK: br label %[[SCAN_PHASE:[^,]+]]
+
+    // CHECK: [[SCAN_PHASE]]:
+    // CHECK: call void @{{.+}}bar{{.+}}()
+    // CHECK: br label %[[EXIT_INSCAN]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @llvm.stackrestore(i8*
+  }
+
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
+  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
+
+  // double b_buffer[10];
+  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+#pragma omp parallel for reduction(inscan, +:a[:n], b)
+  for (int i = 0; i < 10; ++i) {
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // Skip the before scan body.
+    // CHECK: call void @{{.+}}foo{{.+}}()
+
+    // CHECK: [[EXIT_INSCAN:[^,]+]]:
+
+    // a_buffer[i][0..n] = a_priv[[0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_buffer[i] = b_priv;
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+    // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
+    // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
+    // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
+
+    // CHECK: [[DISPATCH]]:
+    // CHECK: br label %[[INPUT_PHASE:[^,]+]]
+
+    // CHECK: [[INPUT_PHASE]]:
+    // CHECK: call void @{{.+}}bar{{.+}}()
+    // CHECK: br label %[[EXIT_INSCAN]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @__kmpc_barrier(
+    foo();
+#pragma omp scan exclusive(a[:n], b)
+    // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
+    // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
+    // CHECK: [[CEIL_LOG2_10_INT:%.+]] = fptoui double [[CEIL_LOG2_10]] to i32
+    // CHECK: br label %[[OUTER_BODY:[^,]+]]
+    // CHECK: [[OUTER_BODY]]:
+    // CHECK: [[K:%.+]] = phi i32 [ 0, %{{.+}} ], [ [[K_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[K2POW:%.+]] = phi i64 [ 1, %{{.+}} ], [ [[K2POW_NEXT:%.+]], %{{.+}} ]
+    // CHECK: [[CMP:%.+]] = icmp uge i64 9, [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY:[^,]+]], label %[[INNER_EXIT:[^,]+]]
+    // CHECK: [[INNER_BODY]]:
+    // CHECK: [[I:%.+]] = phi i64 [ 9, %[[OUTER_BODY]] ], [ [[I_PREV:%.+]], %{{.+}} ]
+
+    // a_buffer[i] += a_buffer[i-pow(2, k)];
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[I]]
+    // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]]
+    // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[IDX_SUB_K2POW]]
+    // CHECK: [[A_BUF_END:%.+]] = getelementptr float, float* [[A_BUF_IDX]], i64 [[NUM_ELEMS]]
+    // CHECK: [[ISEMPTY:%.+]] = icmp eq float* [[A_BUF_IDX]], [[A_BUF_END]]
+    // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]]
+    // CHECK: [[RED_BODY]]:
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_ELEM:%.+]] = phi float* [ [[A_BUF_IDX_SUB_K2POW]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_SUB_K2POW_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_ELEM:%.+]] = phi float* [ [[A_BUF_IDX]], %[[INNER_BODY]] ], [ [[A_BUF_IDX_NEXT:%.+]], %[[RED_BODY]] ]
+    // CHECK: [[A_BUF_IDX_VAL:%.+]] = load float, float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_VAL:%.+]] = load float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]],
+    // CHECK: [[RED:%.+]] = fadd float [[A_BUF_IDX_VAL]], [[A_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store float [[RED]], float* [[A_BUF_IDX_ELEM]],
+    // CHECK: [[A_BUF_IDX_NEXT]] = getelementptr float, float* [[A_BUF_IDX_ELEM]], i32 1
+    // CHECK: [[A_BUF_IDX_SUB_K2POW_NEXT]] = getelementptr float, float* [[A_BUF_IDX_SUB_K2POW_ELEM]], i32 1
+    // CHECK: [[DONE:%.+]] = icmp eq float* [[A_BUF_IDX_NEXT]], [[A_BUF_END]]
+    // CHECK: br i1 [[DONE]], label %[[RED_DONE]], label %[[RED_BODY]]
+    // CHECK: [[RED_DONE]]:
+
+    // b_buffer[i] += b_buffer[i-pow(2, k)];
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: [[B_BUF_IDX_SUB_K2POW_VAL:%.+]] = load double, double* [[B_BUF_IDX_SUB_K2POW]],
+    // CHECK: [[RED:%.+]] = fadd double [[B_BUF_IDX_VAL]], [[B_BUF_IDX_SUB_K2POW_VAL]]
+    // CHECK: store double [[RED]], double* [[B_BUF_IDX]],
+
+    // --i;
+    // CHECK: [[I_PREV:%.+]] = sub nuw i64 [[I]], 1
+    // CHECK: [[CMP:%.+]] = icmp uge i64 [[I_PREV]], [[K2POW]]
+    // CHECK: br i1 [[CMP]], label %[[INNER_BODY]], label %[[INNER_EXIT]]
+    // CHECK: [[INNER_EXIT]]:
+
+    // ++k;
+    // CHECK: [[K_NEXT]] = add nuw i32 [[K]], 1
+    // k2pow <<= 1;
+    // CHECK: [[K2POW_NEXT]] = shl nuw i64 [[K2POW]], 1
+    // CHECK: [[CMP:%.+]] = icmp ne i32 [[K_NEXT]], [[CEIL_LOG2_10_INT]]
+    // CHECK: br i1 [[CMP]], label %[[OUTER_BODY]], label %[[OUTER_EXIT:[^,]+]]
+    // CHECK: [[OUTER_EXIT]]:
+    bar();
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call i8* @llvm.stacksave()
+    // CHECK: store float 0.000000e+00, float* %
+    // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
+    // CHECK: br label %[[DISPATCH:[^,]+]]
+
+    // CHECK: [[SCAN_PHASE:.+]]:
+    // CHECK: call void @{{.+}}foo{{.+}}()
+    // CHECK: br label %[[LOOP_CONTINUE:.+]]
+
+    // CHECK: [[DISPATCH]]:
+    // if (i >0)
+    //   a_priv[[0..n] = a_buffer[i-1][0..n];
+    // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
+    // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
+    // CHECK: [[CMP:%.+]] = icmp eq i64 [[BASE_IDX]], 0
+    // CHECK: br i1 [[CMP]], label %[[IF_DONE:[^,]+]], label %[[IF_THEN:[^,]+]]
+    // CHECK: [[IF_THEN]]:
+    // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1
+    // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]]
+    // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+    // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
+    // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
+    // CHECK: [[DEST:%.+]] = bitcast float* [[A_PRIV]] to i8*
+    // CHECK: [[SRC:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
+
+    // b_priv = b_buffer[i];
+    // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX_SUB_1]]
+    // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, double* [[B_BUF_IDX]],
+    // CHECK: store double [[B_BUF_IDX_VAL]], double* [[B_PRIV_ADDR]],
+    // CHECK: br label %[[SCAN_PHASE]]
+
+    // CHECK: [[LOOP_CONTINUE]]:
+    // CHECK: call void @llvm.stackrestore(i8* %
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: call void @llvm.stackrestore(i8*
+  }
+}
+
+#endif
+


        


More information about the cfe-commits mailing list