[flang-commits] [flang] [flang][mlir] add missing type conversion when lowering atomiccas (PR #164865)

Thu Oct 23 11:08:20 PDT 2025

https://github.com/atmnp updated https://github.com/llvm/llvm-project/pull/164865

>From 2840ee5ca53c5643ceb3b03d0680bf6a466d650a Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmnp at nvidia.com>
Date: Thu, 23 Oct 2025 10:47:03 -0700
Subject: [PATCH] [flang][mlir] add missing type conversion when lowering
 atomiccas

When lowering `atomiccas`, flang does not convert the output of the
`llvm.extractvalue` op to the result type expected by the expression
being lowered. This results in invalid MLIR being generated, for example
when the output of `atomiccas` is used in an equality check in a
`do while` loop condition, where the `arith.cmpi` would be comparing an
`i64 0` with an `i1`. This change ensures that the appropriate cast is
inserted.
---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  4 +++-
 flang/test/Lower/CUDA/cuda-device-proc.cuf    | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6b02fefb92196..39bac818fe5d0 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3106,7 +3106,9 @@ IntrinsicLibrary::genAtomicCas(mlir::Type resultType,
           .getResult(0);
   auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
       builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
-  return mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
+  mlir::Value boolResult =
+      mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
+  return builder.createConvert(loc, resultType, boolResult);
 }
 
 mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType,
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 7d6caf58d71b3..5c4c3c6d39820 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -479,3 +479,16 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_bulk_s2g
 ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+
+attributes(device) subroutine testAtomicCasLoop(aa, n)
+  integer :: a
+  do while (atomiccas(a, 0, 1) == 1)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtestatomiccasloop
+! CHECK: %[[CMP_XCHG:.*]] = llvm.cmpxchg %15, %c0_i32, %c1_i32 acq_rel monotonic : !llvm.ptr, i32
+! CHECK: %[[CMP_XCHG_EV:.*]] = llvm.extractvalue %[[CMP_XCHG]][1] : !llvm.struct<(i32, i1)> 
+! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32
+! CHECK: %{{.*}} = arith.constant 1 : i32
+! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32