[clang] [llvm] [OpenMP] OpenMP 5.1 "assume" directive parsing support (PR #92731)

Fri Jun 21 07:56:48 PDT 2024

https://github.com/jtb20 updated https://github.com/llvm/llvm-project/pull/92731

>From 822249a1f45ce1341e71a9c99dec081d8e8d077f Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 12 Jun 2024 13:58:22 -0500
Subject: [PATCH] [OpenMP] Diagnostic check for imperfect loop collapse

This patch adds a diagnostic which attempts to detect the case where the
"collapse" clause is used with imperfectly-nested parallel loops,
something like this:

  #pragma omp target
  #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
      arr[i][i] = ...;
      for (int j = 0; j < N; j++) {
        arr[i][j] = ...;
      }
    }

This kind of nesting is permitted by OpenMP 5+.

At a glance, this appears fine: the outer loop iterations are
independent, so can be executed in parallel, and the inner loop
iterations are also independent and can be executed in parallel.

However, the "collapse" clause works by essentially moving the
not-perfectly-nested statements into the innermost loop.  This is
sometimes harmless but inefficient (the statement gets executed more times
than a naive user might expect), but in this case the combined/collapsed
loop iterations now have a data dependency between them:

  for (int ij = 0; ij < N*N; ij++) {
    int i = ij / N, j = ij % N;
    arr[i][i] = ...; // all of these...
    arr[i][j] = ...; // ...would have to be executed before all of these
  }

...and that means the result is (silently!) incorrect.

Since this seems like an easy mistake to make, I was interested to find
out if there was a feasible and reasonably-accurate way to try to
diagnose it.  This is what I came up with.

Firstly, in Clang, memory load/store instructions emitted from
statements in the "imperfect" parts of loop nests are annotated with
a new annotation, "llvm.omp.loop.imperfection".  Then in LLVM proper,
in the OpenMPOpt pass (because I couldn't find anywhere that looked more
appropriate), memory load/store instructions in collapsed loops are
partitioned into two groups, with or without the annotation.  Then if
any of the first group may/must alias with any in the second group,
a warning (actually a "remark") is emitted.

The remark is opt-in.  The user must compile with
"-Rpass-analysis=openmp-opt" to trigger it.  That seems appropriate,
because the diagnostic potentially has a false-positive rate that is too
high for a regular warning, but on the other hand users aren't likely to
benefit from the true-positive warning unless they know to use the option.
Comments welcome.

FWIW, I don't think there's a reasonable, safe way to collapse loops
like this and maintain parallel semantics, but ICBW.
---
 clang/lib/CodeGen/CGStmtOpenMP.cpp           |  7 ++-
 clang/lib/CodeGen/CodeGenFunction.cpp        | 14 ++++-
 clang/lib/CodeGen/CodeGenFunction.h          | 22 +++++++
 clang/test/OpenMP/for_collapse_imperfect.cpp | 65 ++++++++++++++++++++
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp        | 56 +++++++++++++++++
 5 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/OpenMP/for_collapse_imperfect.cpp

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index f73d32de7c484..4a210bbea734c 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1898,7 +1898,12 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop,
       return;
     }
   }
-  CGF.EmitStmt(S);
+  if (SimplifiedS != NextLoop) {
+    CodeGenFunction::OMPLoopImperfectionRAII OLI(CGF);
+    CGF.EmitStmt(S);
+  } else {
+    CGF.EmitStmt(S);
+  }
 }
 
 void CodeGenFunction::EmitOMPLoopBody(const OMPLoopDirective &D,
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 200c40da8bc43..dc945ce26b99c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -44,6 +44,8 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/CRC.h"
 #include "llvm/Support/xxhash.h"
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
@@ -2647,8 +2649,18 @@ void CGBuilderInserter::InsertHelper(
     llvm::Instruction *I, const llvm::Twine &Name,
     llvm::BasicBlock::iterator InsertPt) const {
   llvm::IRBuilderDefaultInserter::InsertHelper(I, Name, InsertPt);
-  if (CGF)
+  if (CGF) {
     CGF->InsertHelper(I, Name, InsertPt);
+    if (CGF->GetOMPLoopImperfection() &&
+        I->mayReadOrWriteMemory()) {
+      llvm::LLVMContext &Ctx = CGF->getLLVMContext();
+      llvm::MDNode *Imp = llvm::MDNode::get(Ctx,
+                                            llvm::ConstantAsMetadata::get(
+                                              llvm::ConstantInt::get(
+                                              llvm::Type::getInt1Ty(Ctx), 1)));
+      I->setMetadata("llvm.omp.loop.imperfection", Imp);
+    }
+  }
 }
 
 // Emits an error if we don't have a valid set of target features for the
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index cdb5ae6663405..e97fecb09c240 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3633,6 +3633,22 @@ class CodeGenFunction : public CodeGenTypeCache {
     ~OMPCancelStackRAII() { CGF.OMPCancelStack.exit(CGF); }
   };
 
+  /// Controls emission of "llvm.omp.loop.imperfection" metadata on
+  /// load/store instructions.
+  class OMPLoopImperfectionRAII {
+    CodeGenFunction &CGF;
+    bool OldValue;
+
+  public:
+    OMPLoopImperfectionRAII(CodeGenFunction &CGF) : CGF(CGF) {
+      OldValue = CGF.GetOMPLoopImperfection();
+      CGF.SetOMPLoopImperfection (true);
+    }
+    ~OMPLoopImperfectionRAII() {
+      CGF.SetOMPLoopImperfection (OldValue);
+    }
+  };
+
   /// Returns calculated size of the specified type.
   llvm::Value *getTypeSize(QualType Ty);
   LValue InitCapturedStruct(const CapturedStmt &S);
@@ -3991,6 +4007,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// Emits the lvalue for the expression with possibly captured variable.
   LValue EmitOMPSharedLValue(const Expr *E);
 
+  bool GetOMPLoopImperfection() { return OMPLoopImperfection; }
+  void SetOMPLoopImperfection(bool I) { OMPLoopImperfection = I; }
+
 private:
   /// Helpers for blocks.
   llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info);
@@ -4031,6 +4050,9 @@ class CodeGenFunction : public CodeGenTypeCache {
           IncExpr(IncExpr), Init(Init), Cond(Cond), NextLB(NextLB),
           NextUB(NextUB) {}
   };
+
+  bool OMPLoopImperfection = false;
+
   void EmitOMPOuterLoop(bool DynamicOrOrdered, bool IsMonotonic,
                         const OMPLoopDirective &S, OMPPrivateScope &LoopScope,
                         const OMPLoopArguments &LoopArgs,
diff --git a/clang/test/OpenMP/for_collapse_imperfect.cpp b/clang/test/OpenMP/for_collapse_imperfect.cpp
new file mode 100644
index 0000000000000..4da6e3dbdbb0a
--- /dev/null
+++ b/clang/test/OpenMP/for_collapse_imperfect.cpp
@@ -0,0 +1,65 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -verify=host -O2 -triple x86_64-unknown-unknown -Rpass-analysis=openmp-opt -fopenmp -x c++ -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86_64-host.bc
+// RUN: %clang_cc1 -verify=analysis -O2 -triple amdgcn-amd-amdhsa -Rpass-analysis=openmp-opt -fopenmp -x c++ -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86_64-host.bc -o %t.out
+
+// host-no-diagnostics
+
+#define N 256
+
+int main() {
+  double arr[N][N];
+  double b[N];
+  float c[N];
+
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      arr[j][i] = 0.0;
+
+  // These nested loops look parallelisable at a glance, but if they are
+  // collapsed, iterations are no longer data-independent with respect to each
+  // other.  So we emit a remark saying so.
+#pragma omp target map(tofrom: arr)
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < N; i++) {
+    arr[i][i] = i * 10; // #0
+    // analysis-remark@#0 {{Collapsing imperfectly-nested loop may introduce unexpected data dependencies}}
+    for (int j = 0; j < N; j++) {
+      arr[i][j]++;
+    }
+  }
+
+  // This is fine, the declaration of 'f' can't affect the array 'arr'.
+#pragma omp target map(tofrom: arr)
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < N; i++) {
+    double f = i * 10;
+    for (int j = 0; j < N; j++) {
+      arr[i][j] += (i == j) ? f : 1;
+    }
+  }
+
+  // The accesses in this loop could be disambiguated, but currently aren't.
+  // So this is a false positive for the remark.
+#pragma omp target map(tofrom: arr, b[0:N])
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < N; i++) {
+    b[i] = i; // #1
+    // analysis-remark@#1 {{Collapsing imperfectly-nested loop may introduce unexpected data dependencies}}
+    for (int j = 0; j < N; j++) {
+      arr[i][j]++;
+    }
+  }
+
+  // This is fine though, presumably TBAA takes care of it.  No remark emitted.
+#pragma omp target map(tofrom: arr, c[0:N])
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < N; i++) {
+    c[i] = i;
+    for (int j = 0; j < N; j++) {
+      arr[i][j]++;
+    }
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index e3a4821b8226b..60acc35623205 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -5721,6 +5721,62 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
 
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  // !!! This pass may be invoked at several points in the compilation
+  // pipeline, but we only want to emit these remarks once.  The outermost
+  // condition below is a somewhat crude attempt at ensuring that.
+  // Note that if -save-temps is used, duplicate remarks may be shown.
+  if (isOpenMPDevice(M)) {
+    for (Function &F : M) {
+      if (F.isDeclaration())
+        continue;
+      DominatorTree DT(F);
+      LoopInfo LI(DT);
+      for (const auto &L : LI) {
+        SmallVector<const Instruction *, 4> ImperfectMemInsns;
+        SmallVector<const Instruction *, 4> OtherMemInsns;
+        if (L->getName().starts_with("omp")) {
+          const auto &BBs = L->getBlocksVector();
+          for (const auto &BB : BBs) {
+            for (const auto &I : *BB) {
+              if (I.mayReadOrWriteMemory()) {
+                if (I.hasMetadata("llvm.omp.loop.imperfection")) {
+                  ImperfectMemInsns.push_back(&I);
+                } else {
+                  OtherMemInsns.push_back(&I);
+                }
+              }
+            }
+          }
+        }
+        if (!ImperfectMemInsns.empty()) {
+          AliasAnalysis &AA = FAM.getResult<AAManager>(F);
+          const Instruction *BadInsn = nullptr;
+          for (auto &O : OtherMemInsns) {
+            MemoryLocation OML = MemoryLocation::get(O);
+            for (auto &I : ImperfectMemInsns) {
+              MemoryLocation IML = MemoryLocation::get(I);
+              if (!AA.isNoAlias(OML, IML)) {
+                BadInsn = I;
+                break;
+              }
+            }
+            if (BadInsn)
+              break;
+          }
+          if (BadInsn) {
+            auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+            OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP190", BadInsn);
+            ORE.emit([&ORA]() {
+              return ORA << "Collapsing imperfectly-nested loop may "
+                            "introduce unexpected data dependencies";
+            });
+          }
+        }
+      }
+    }
+  }
+
   KernelSet Kernels = getDeviceKernels(M);
 
   if (PrintModuleBeforeOptimizations)