[clang] [OpenMP] Allow GPUs to be targeted directly via `-fopenmp`. (PR #122149)

Wed Jan 8 10:23:50 PST 2025

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/122149

Summary:
Currently we prevent the following from working. However, it is
completely reasonable to be able to target the files individually.
```
$ clang --target=amdgcn-amd-amdhsa -fopenmp
```

This patch lifts this restriction, allowing individual files to be
compiled as standalone OpenMP without the extra offloading overhead. The
main motivation behind this is to update the build of the OpenMP
DeviceRTL. Currently, we do `--offload-device-only -S -emit-llvm` which
is just a hackier version of `-fopenmp -flto -c`.

This patch allows the following to work.
```
$ clang omp.c -fopenmp --target=amdgcn-amd-amdhsa -flto -c
$ clang offload.c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xoffload-linker omp.o
$ ./a.out
```


>From 378ee8272939f4a1e5418b309daf404a6c4f2f00 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 8 Jan 2025 12:19:53 -0600
Subject: [PATCH] [OpenMP] Allow GPUs to be targeted directly via `-fopenmp`.

Summary:
Currently we prevent the following from working. However, it is
completely reasonable to be able to target the files individually.
```
$ clang --target=amdgcn-amd-amdhsa -fopenmp
```

This patch lifts this restriction, allowing individual files to be
compiled as standalone OpenMP without the extra offloading overhead. The
main motivation behind this is to update the build of the OpenMP
DeviceRTL. Currently, we do `--offload-device-only -S -emit-llvm` which
is just a hackier version of `-fopenmp -flto -c`.

This patch allows the following to work.
```
$ clang omp.c -fopenmp --target=amdgcn-amd-amdhsa -flto -c
$ clang offload.c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xoffload-linker omp.o
$ ./a.out
```
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp  |   3 -
 clang/lib/CodeGen/CGStmtOpenMP.cpp        |   3 +-
 clang/lib/CodeGen/CodeGenModule.cpp       |   2 -
 clang/lib/Frontend/CompilerInvocation.cpp |  13 --
 clang/test/OpenMP/gpu_target.cpp          | 220 ++++++++++++++++++++++
 clang/test/OpenMP/target_messages.cpp     |   1 -
 6 files changed, 222 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/OpenMP/gpu_target.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 756f0482b8ea72..1ad4b4b0e8a7fc 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -870,9 +870,6 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
       hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
   OMPBuilder.setConfig(Config);
 
-  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
-    llvm_unreachable("OpenMP can only handle device code.");
-
   if (CGM.getLangOpts().OpenMPCUDAMode)
     CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
 
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 6cb37b20b7aeee..950ed173aecf3a 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6801,7 +6801,8 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
   CodeGenModule &CGM = CGF.CGM;
 
   // On device emit this construct as inlined code.
-  if (CGM.getLangOpts().OpenMPIsTargetDevice) {
+  if (CGM.getLangOpts().OpenMPIsTargetDevice ||
+      CGM.getOpenMPRuntime().isGPU()) {
     OMPLexicalScope Scope(CGF, S, OMPD_target);
     CGM.getOpenMPRuntime().emitInlinedDirective(
         CGF, OMPD_target, [&S](CodeGenFunction &CGF, PrePostActionTy &) {
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 5f15f0f48c54e4..26abd9a60632ae 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -483,8 +483,6 @@ void CodeGenModule::createOpenMPRuntime() {
   case llvm::Triple::nvptx:
   case llvm::Triple::nvptx64:
   case llvm::Triple::amdgcn:
-    assert(getLangOpts().OpenMPIsTargetDevice &&
-           "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
     OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
     break;
   default:
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d711df02ce9503..d2df51593ff62b 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4210,19 +4210,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
             Args, OPT_fopenmp_version_EQ,
             (IsSimdSpecified || IsTargetSpecified) ? 51 : Opts.OpenMP, Diags))
       Opts.OpenMP = Version;
-    // Provide diagnostic when a given target is not expected to be an OpenMP
-    // device or host.
-    if (!Opts.OpenMPIsTargetDevice) {
-      switch (T.getArch()) {
-      default:
-        break;
-      // Add unsupported host targets here:
-      case llvm::Triple::nvptx:
-      case llvm::Triple::nvptx64:
-        Diags.Report(diag::err_drv_omp_host_target_not_supported) << T.str();
-        break;
-      }
-    }
   }
 
   // Set the flag to prevent the implementation from emitting device exception
diff --git a/clang/test/OpenMP/gpu_target.cpp b/clang/test/OpenMP/gpu_target.cpp
new file mode 100644
index 00000000000000..3d5a47d7050436
--- /dev/null
+++ b/clang/test/OpenMP/gpu_target.cpp
@@ -0,0 +1,220 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --version 5
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX
+
+typedef enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
+} omp_allocator_handle_t;
+
+int d = 0;
+#pragma omp allocate(d) allocator(omp_default_mem_alloc)
+
+int g = 0;
+#pragma omp allocate(g) allocator(omp_cgroup_mem_alloc)
+
+extern const int c = 0;
+#pragma omp allocate(c) allocator(omp_const_mem_alloc)
+
+
+int foo() {
+  int t = 0;
+#pragma omp allocate(t) allocator(omp_thread_mem_alloc)
+  return t;
+}
+
+void bar() {
+#pragma omp target
+  ;
+#pragma omp parallel
+  ;
+}
+
+void baz(int *p) {
+#pragma omp atomic
+  *p += 1;
+}
+
+int qux() {
+#if defined(__NVPTX__)
+  return 1;
+#elif defined(__AMDGPU__)
+  return 2;
+#endif
+}
+//.
+// AMDGCN: @c = addrspace(4) constant i32 0, align 4
+// AMDGCN: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// AMDGCN: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// AMDGCN: @d = global i32 0, align 4
+// AMDGCN: @g = global i32 0, align 4
+// AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// NVPTX: @d = global i32 0, align 4
+// NVPTX: @g = global i32 0, align 4
+// NVPTX: @c = addrspace(4) constant i32 0, align 4
+// NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+//.
+// AMDGCN-LABEL: define dso_local noundef i32 @_Z3foov(
+// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[T:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT:    [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr
+// AMDGCN-NEXT:    store i32 0, ptr [[T_ASCAST]], align 4
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[T_ASCAST]], align 4
+// AMDGCN-NEXT:    ret i32 [[TMP0]]
+//
+//
+// AMDGCN-LABEL: define dso_local void @_Z3barv(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// AMDGCN-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
+// AMDGCN-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0)
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined(
+// AMDGCN-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// AMDGCN-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// AMDGCN-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
+// AMDGCN-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// AMDGCN-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// AMDGCN-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// AMDGCN-NEXT:    [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// AMDGCN-NEXT:    [[GLOBAL_ARGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr
+// AMDGCN-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR_ASCAST]], align 2
+// AMDGCN-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// AMDGCN-NEXT:    store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_ASCAST]])
+// AMDGCN-NEXT:    call void @_Z3barv_omp_outlined(ptr [[DOTADDR1_ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @_Z3bazPi(
+// AMDGCN-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
+// AMDGCN-NEXT:    store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN-LABEL: define dso_local noundef i32 @_Z3quxv(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT:    ret i32 2
+//
+//
+// NVPTX-LABEL: define dso_local noundef i32 @_Z3foov(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[T:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i32 0, ptr [[T]], align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[T]], align 4
+// NVPTX-NEXT:    ret i32 [[TMP0]]
+//
+//
+// NVPTX-LABEL: define dso_local void @_Z3barv(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+// NVPTX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// NVPTX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
+// NVPTX-NEXT:    ret void
+//
+//
+// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined(
+// NVPTX-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// NVPTX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// NVPTX-NEXT:    ret void
+//
+//
+// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
+// NVPTX-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+// NVPTX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// NVPTX-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// NVPTX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// NVPTX-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
+// NVPTX-NEXT:    call void @_Z3barv_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]]
+// NVPTX-NEXT:    ret void
+//
+//
+// NVPTX-LABEL: define dso_local void @_Z3bazPi(
+// NVPTX-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// NVPTX-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
+// NVPTX-NEXT:    ret void
+//
+//
+// NVPTX-LABEL: define dso_local noundef i32 @_Z3quxv(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    ret i32 1
+//
+//.
+// AMDGCN: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// AMDGCN: attributes #[[ATTR3]] = { nounwind }
+// AMDGCN: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
+//.
+// NVPTX: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// NVPTX: attributes #[[ATTR3]] = { nounwind }
+// NVPTX: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
+//.
+// AMDGCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
+// AMDGCN: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 45}
+// AMDGCN: [[META3:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// NVPTX: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// NVPTX: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 45}
+// NVPTX: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/OpenMP/target_messages.cpp b/clang/test/OpenMP/target_messages.cpp
index 62ab5817ba28da..7e68adddc949e0 100644
--- a/clang/test/OpenMP/target_messages.cpp
+++ b/clang/test/OpenMP/target_messages.cpp
@@ -8,7 +8,6 @@
 // CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
 // RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx64-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
 // RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
-// CHECK-UNSUPPORTED-HOST-TARGET: error: target '{{nvptx64-nvidia-cuda|nvptx-nvidia-cuda}}' is not a supported OpenMP host target
 // RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=hexagon-linux-gnu -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-DEVICE-TARGET %s
 // CHECK-UNSUPPORTED-DEVICE-TARGET: OpenMP target is invalid: 'hexagon-linux-gnu'