[flang-commits] [flang] 2befcb4 - [flang] Restrict O0 hlfir.assign scalar-to-array inlining to OpenMP target device (#201774)

Mon Jun 8 09:06:42 PDT 2026

Author: Sairudra More
Date: 2026-06-08T21:36:35+05:30
New Revision: 2befcb4d3224a78e0676c6cac29c37125088643d

URL: https://github.com/llvm/llvm-project/commit/2befcb4d3224a78e0676c6cac29c37125088643d
DIFF: https://github.com/llvm/llvm-project/commit/2befcb4d3224a78e0676c6cac29c37125088643d.diff

LOG: [flang] Restrict O0 hlfir.assign scalar-to-array inlining to OpenMP target device (#201774)

Follow-up to #197092.

That PR fixed the OpenMP target-device case by running scalar-to-array
`hlfir.assign` inlining at `-O0`, so device code would not lower through
`_FortranAAssign`.

However, running that path for normal host `-g -O0` also changed
debugging behavior: a breakpoint on a scalar broadcast like `arr = 11`
could be hit once per array element because the assignment became an
inline loop.

This patch restricts the `-O0` scalar-RHS-only inlining path to OpenMP
target-device compilation. Host `-O0` keeps the existing
`_FortranAAssign` path and debugger behavior, while device `-O0` still
avoids the runtime call.

Two focused tests cover both sides:
- host `-O0` scalar broadcast keeps `_FortranAAssign`
- OpenMP target-device `-O0` scalar broadcast avoids `_FortranAAssign`

Fixing debug locations for generated inline assignment loops more
generally is left as separate follow-up work, tracked in #202065.

Added: 
    flang/test/Lower/OpenMP/scalar-to-array-assign-target-device-O0.f90
    flang/test/Lower/scalar-to-array-assign-host-O0.f90

Modified: 
    flang/include/flang/Tools/CrossToolHelpers.h
    flang/lib/Frontend/FrontendActions.cpp
    flang/lib/Optimizer/Passes/Pipelines.cpp
    flang/test/Driver/mlir-debug-pass-pipeline.f90
    flang/test/Driver/mlir-pass-pipeline.f90
    flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
    flang/test/Integration/OpenMP/private-global.f90
    flang/test/Integration/prefetch.f90
    flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90
    flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90
    flang/tools/bbc/bbc.cpp

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index 6240354bd899a..90e159cc157bf 100644

--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -141,6 +141,8 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
                                       ///< functions.
   bool NSWOnLoopVarInc = true; ///< Add nsw flag to loop variable increments.
   bool EnableOpenMP = false; ///< Enable OpenMP lowering.
+  bool EnableOpenMPIsTargetDevice =
+      false; ///< Compiling for an OpenMP target device.
   bool UseSampleProfile = false; ///< Enable sample based profiling
   bool DebugInfoForProfiling = false; ///< Enable extra debugging info
   bool EnableOpenMPSimd = false; ///< Enable OpenMP simd-only mode.

diff  --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 0d154a7157867..66602ed52f6cd 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -633,6 +633,8 @@ void CodeGenAction::lowerHLFIRToFIR() {
   MLIRToLLVMPassPipelineConfig config(level);
   config.fpMaxminBehavior =
       ci.getInvocation().getLoweringOpts().getFPMaxminBehavior();
+  if (ci.getInvocation().getLangOpts().OpenMPIsTargetDevice)
+    config.EnableOpenMPIsTargetDevice = true;
   // Create the pass pipeline
   fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, config);
   (void)mlir::applyPassManagerCLOptions(pm);
@@ -763,6 +765,9 @@ void CodeGenAction::generateLLVMIR() {
           Fortran::common::LanguageFeature::OpenMP))
     config.EnableOpenMP = true;
 
+  if (ci.getInvocation().getLangOpts().OpenMPIsTargetDevice)
+    config.EnableOpenMPIsTargetDevice = true;
+
   if (ci.getInvocation().getLangOpts().OpenMPSimd)
     config.EnableOpenMPSimd = true;
 

diff  --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 682e3e48e0a22..8e8521391885e 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -313,10 +313,14 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
       addNestedPassToAllTopLevelOperations<PassConstructor>(
           pm, hlfir::createInlineHLFIRCopyIn);
     }
-  } else {
-    // At O0, only inline scalar-to-array broadcasts. This avoids emitting
-    // Fortran runtime calls (e.g. _FortranAAssign) that use malloc/free in
-    // device code generated by OpenMP target offloading.
+  } else if (config.EnableOpenMPIsTargetDevice) {
+    // At O0, only inline scalar-to-array broadcasts when compiling for an
+    // OpenMP target device. This avoids emitting Fortran runtime calls
+    // (e.g. _FortranAAssign) that use malloc/free in device code generated
+    // by OpenMP target offloading. Restricting this to target-device
+    // compilation preserves the runtime call on the host at -O0 so that a
+    // line breakpoint on a scalar-to-array assignment hits once instead of
+    // once per element.
     addNestedPassToAllTopLevelOperations(pm, [&]() {
       return hlfir::createInlineHLFIRAssign({/*onlyScalarRHS=*/true});
     });

diff  --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index c5e63fdbd9d2b..d5126012b6957 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -32,23 +32,18 @@
 ! ALL-NEXT: 'fir.global' Pipeline
 ! ALL-NEXT:   InlineElementals
 ! ALL-NEXT:   SeparateAllocatableAssign
-! ALL-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT: 'func.func' Pipeline
 ! ALL-NEXT:   InlineElementals
 ! ALL-NEXT:   SeparateAllocatableAssign
-! ALL-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT: 'omp.declare_mapper' Pipeline
 ! ALL-NEXT:   InlineElementals
 ! ALL-NEXT:   SeparateAllocatableAssign
-! ALL-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT: 'omp.declare_reduction' Pipeline
 ! ALL-NEXT:   InlineElementals
 ! ALL-NEXT:   SeparateAllocatableAssign
-! ALL-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT: 'omp.private' Pipeline
 ! ALL-NEXT:   InlineElementals
 ! ALL-NEXT:   SeparateAllocatableAssign
-! ALL-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT: LowerHLFIROrderedAssignments
 ! ALL-NEXT: LowerHLFIRIntrinsics
 ! ALL-NEXT: BufferizeHLFIR

diff  --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index a7ea0a9de4867..b679564adff10 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -9,6 +9,11 @@
 
 end program
 
+! At -O0 on the host (no OpenMP target-device compilation), InlineHLFIRAssign
+! is no longer scheduled: PR #201774, the follow-up to #197092, restricts the
+! -O0 pass to OpenMP target-device compilation.
+! O0-NOT: InlineHLFIRAssign
+
 ! ALL: Pass statistics report
 ! ALL: Fortran::lower::VerifierPass
 
@@ -32,27 +37,22 @@
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
 ! ALL-NEXT:  SeparateAllocatableAssign
-! O0-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT:'func.func' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
 ! ALL-NEXT:  SeparateAllocatableAssign
-! O0-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT:'omp.declare_mapper' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
 ! ALL-NEXT:  SeparateAllocatableAssign
-! O0-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT:'omp.declare_reduction' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
 ! ALL-NEXT:  SeparateAllocatableAssign
-! O0-NEXT:   InlineHLFIRAssign
 ! ALL-NEXT:'omp.private' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
 ! ALL-NEXT:  SeparateAllocatableAssign
-! O0-NEXT:   InlineHLFIRAssign
 ! O2-NEXT: Canonicalizer
 ! O2-NEXT: CSE
 ! O2-NEXT: (S) {{.*}} num-cse'd

diff  --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
index c4688a6e8a192..94080d1d4f975 100644
--- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
+++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
@@ -50,7 +50,7 @@ subroutine worst_case(a, b, c, d)
 ! CHECK:         br i1 %{{.*}}, label %omp.private.init3, label %omp.private.init4
 
 ! CHECK:       omp.private.init4:                               ; preds = %omp.private.init2
-!                [finish private alloc for first var with zero extent]
+!                [finish private alloc for second var with zero extent]
 ! CHECK:         br label %omp.private.init5
 
 ! CHECK:       omp.private.init5:                               ; preds = %omp.private.init3, %omp.private.init4
@@ -61,13 +61,13 @@ subroutine worst_case(a, b, c, d)
 ! CHECK-NEXT:    br label %omp.private.init7
 
 ! CHECK:       omp.private.init7:
-!                [begin private alloc for second var]
+!                [begin private alloc for first var]
 !                [read the length from the mold argument]
 !                [if it is non-zero...]
 ! CHECK:         br i1 {{.*}}, label %omp.private.init8, label %omp.private.init9
 
 ! CHECK:       omp.private.init9:                               ; preds = %omp.private.init7
-!                [finish private alloc for second var with zero extent]
+!                [finish private alloc for first var with zero extent]
 ! CHECK:         br label %omp.private.init10
 
 ! CHECK:       omp.private.init10:                               ; preds = %omp.private.init8, %omp.private.init9
@@ -109,60 +109,46 @@ subroutine worst_case(a, b, c, d)
 ! CHECK:         br label %[[VAL_96:.*]]
 
 ! CHECK:       omp.reduction.neutral:                            ; preds = %omp.reduction.init
-!                [start of reduction initialization region for first var]
+!                [start of reduction initialization region]
 !                [null check:]
 ! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral20, label %omp.reduction.neutral21
 
 ! CHECK:       omp.reduction.neutral21:                          ; preds = %omp.reduction.neutral
-!                [malloc the reduction variable]
+!                [malloc and assign the default value to the reduction variable]
 ! CHECK:         br label %omp.reduction.neutral22
 
-! CHECK:       omp.reduction.neutral22:                          ; preds = %omp.reduction.neutral23, %omp.reduction.neutral21
-!                [inlined scalar-to-array init loop header]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24
-
-! CHECK:       omp.reduction.neutral24:                          ; preds = %omp.reduction.neutral22
-! CHECK:         br label %omp.reduction.neutral25
-
-! CHECK:       omp.reduction.neutral25:                          ; preds = %omp.reduction.neutral20, %omp.reduction.neutral24
+! CHECK:       omp.reduction.neutral22:                          ; preds = %omp.reduction.neutral20, %omp.reduction.neutral21
 ! CHECK-NEXT:    br label %omp.region.cont19
 
-! CHECK:       omp.region.cont19:                                ; preds = %omp.reduction.neutral25
+! CHECK:       omp.region.cont19:                                ; preds = %omp.reduction.neutral22
 ! CHECK-NEXT:    %{{.*}} = phi ptr
-! CHECK-NEXT:    br label %omp.reduction.neutral27
+! CHECK-NEXT:    br label %omp.reduction.neutral24
 
-! CHECK:       omp.reduction.neutral27:                          ; preds = %omp.region.cont19
-!                [start of reduction initialization region for second var]
+! CHECK:       omp.reduction.neutral24:                          ; preds = %omp.region.cont19
+!                [start of reduction initialization region]
 !                [null check:]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral28, label %omp.reduction.neutral29
-
-! CHECK:       omp.reduction.neutral29:                          ; preds = %omp.reduction.neutral27
-!                [malloc the reduction variable]
-! CHECK:         br label %omp.reduction.neutral30
-
-! CHECK:       omp.reduction.neutral30:                          ; preds = %omp.reduction.neutral31, %omp.reduction.neutral29
-!                [inlined scalar-to-array init loop header]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral31, label %omp.reduction.neutral32
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral25, label %omp.reduction.neutral26
 
-! CHECK:       omp.reduction.neutral32:                          ; preds = %omp.reduction.neutral30
-! CHECK:         br label %omp.reduction.neutral33
+! CHECK:       omp.reduction.neutral26:                          ; preds = %omp.reduction.neutral24
+!                [malloc and assign the default value to the reduction variable]
+! CHECK:         br label %omp.reduction.neutral27
 
-! CHECK:       omp.reduction.neutral33:                          ; preds = %omp.reduction.neutral28, %omp.reduction.neutral32
-! CHECK-NEXT:    br label %omp.region.cont26
+! CHECK:       omp.reduction.neutral27:                          ; preds = %omp.reduction.neutral25, %omp.reduction.neutral26
+! CHECK-NEXT:    br label %omp.region.cont23
 
-! CHECK:       omp.region.cont26:                                ; preds = %omp.reduction.neutral33
+! CHECK:       omp.region.cont23:                                ; preds = %omp.reduction.neutral27
 ! CHECK-NEXT:    %{{.*}} = phi ptr
-! CHECK-NEXT:    br label %omp.par.region35
+! CHECK-NEXT:    br label %omp.par.region29
 
-! CHECK:       omp.par.region35:                                 ; preds = %omp.region.cont26
+! CHECK:       omp.par.region29:                                 ; preds = %omp.region.cont23
 !                [call SUM runtime function]
 !                [if (sum(a) == 1)]
-! CHECK:         br i1 %{{.*}}, label %omp.par.region36, label %omp.par.region37
+! CHECK:         br i1 %{{.*}}, label %omp.par.region30, label %omp.par.region31
 
-! CHECK:       omp.par.region37:                                 ; preds = %omp.par.region35
-! CHECK-NEXT:    br label %omp.region.cont34
+! CHECK:       omp.par.region31:                                 ; preds = %omp.par.region29
+! CHECK-NEXT:    br label %omp.region.cont28
 
-! CHECK:       omp.region.cont34:                                ; preds = %omp.par.region36, %omp.par.region37
+! CHECK:       omp.region.cont28:                                ; preds = %omp.par.region30, %omp.par.region31
 !                [omp parallel region done, call into the runtime to complete reduction]
 ! CHECK:         %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
 ! CHECK:         switch i32 %[[VAL_233]], label %reduce.finalize [
@@ -170,16 +156,16 @@ subroutine worst_case(a, b, c, d)
 ! CHECK-NEXT:      i32 2, label %reduce.switch.atomic
 ! CHECK-NEXT:    ]
 
-! CHECK:       reduce.switch.atomic:                             ; preds = %omp.region.cont34
+! CHECK:       reduce.switch.atomic:                             ; preds = %omp.region.cont28
 ! CHECK-NEXT:    unreachable
 
-! CHECK:       reduce.switch.nonatomic:                          ; preds = %omp.region.cont34
+! CHECK:       reduce.switch.nonatomic:                          ; preds = %omp.region.cont28
 ! CHECK-NEXT:    %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
 ! CHECK-NEXT:    br label %omp.reduction.nonatomic.body
 
 !              [various blocks implementing the reduction]
 
-! CHECK:       omp.region.cont42:                                ; preds =
+! CHECK:       omp.region.cont36:                                ; preds =
 ! CHECK-NEXT:    %{{.*}} = phi ptr
 ! CHECK-NEXT:    call void @__kmpc_end_reduce(
 ! CHECK-NEXT:    br label %reduce.finalize
@@ -196,59 +182,29 @@ subroutine worst_case(a, b, c, d)
 
 ! CHECK:       omp.reduction.cleanup:                            ; preds = %.fini
 !                [null check]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup48, label %omp.reduction.cleanup49
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup42, label %omp.reduction.cleanup43
 
-! CHECK:       omp.reduction.cleanup49:                          ; preds = %omp.reduction.cleanup48, %omp.reduction.cleanup
-! CHECK-NEXT:    br label %omp.region.cont47
+! CHECK:       omp.reduction.cleanup43:                          ; preds = %omp.reduction.cleanup42, %omp.reduction.cleanup
+! CHECK-NEXT:    br label %omp.region.cont41
 
-! CHECK:       omp.region.cont47:                                ; preds = %omp.reduction.cleanup49
-! CHECK:         br label %omp.reduction.cleanup51
+! CHECK:       omp.region.cont41:                                ; preds = %omp.reduction.cleanup43
+! CHECK-NEXT:    %{{.*}} = load ptr, ptr
+! CHECK-NEXT:    br label %omp.reduction.cleanup45
 
-! CHECK:       omp.reduction.cleanup51:                          ; preds = %omp.region.cont47
+! CHECK:       omp.reduction.cleanup45:                          ; preds = %omp.region.cont41
 !                [null check]
-! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup52, label %omp.reduction.cleanup53
-
-! CHECK:       omp.reduction.cleanup53:                          ; preds = %omp.reduction.cleanup52, %omp.reduction.cleanup51
-! CHECK-NEXT:    br label %omp.region.cont50
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.cleanup46, label %omp.reduction.cleanup47
 
-! CHECK:       omp.region.cont50:                                ; preds = %omp.reduction.cleanup53
-! CHECK-NEXT:    br label %omp.private.dealloc
-
-! CHECK:       omp.private.dealloc:                              ; preds = %omp.region.cont50
-!                [null check for first private var dealloc]
-! CHECK:         br i1 %{{.*}}, label %omp.private.dealloc55, label %omp.private.dealloc56
-
-! CHECK:       omp.private.dealloc56:                            ; preds = %omp.private.dealloc55, %omp.private.dealloc
-! CHECK-NEXT:    br label %omp.region.cont54
-
-! CHECK:       omp.region.cont54:                                ; preds = %omp.private.dealloc56
-! CHECK-NEXT:    br label %omp.private.dealloc58
-
-! CHECK:       omp.private.dealloc58:                            ; preds = %omp.region.cont54
-!                [null check for second private var dealloc]
-! CHECK:         br i1 %{{.*}}, label %omp.private.dealloc59, label %omp.private.dealloc60
-
-! CHECK:       omp.private.dealloc60:                            ; preds = %omp.private.dealloc59, %omp.private.dealloc58
-! CHECK-NEXT:    br label %omp.region.cont57
-
-! CHECK:       omp.par.region36:                                 ; preds = %omp.par.region35
+! CHECK:       omp.par.region30:                                 ; preds = %omp.par.region29
 ! CHECK-NEXT:    call void @_FortranAStopStatement
 
-! CHECK:       omp.reduction.neutral31:                          ; preds = %omp.reduction.neutral30
-!                [inlined init loop body for second var]
-! CHECK:         br label %omp.reduction.neutral30
-
-! CHECK:       omp.reduction.neutral28:                          ; preds = %omp.reduction.neutral27
-!                [source length was zero: finish initializing second var]
-! CHECK:         br label %omp.reduction.neutral33
-
-! CHECK:       omp.reduction.neutral23:                          ; preds = %omp.reduction.neutral22
-!                [inlined init loop body for first var]
-! CHECK:         br label %omp.reduction.neutral22
+! CHECK:       omp.reduction.neutral25:                          ; preds = %omp.reduction.neutral24
+!                [source length was zero: finish initializing array]
+! CHECK:         br label %omp.reduction.neutral27
 
 ! CHECK:       omp.reduction.neutral20:                          ; preds = %omp.reduction.neutral
-!                [source length was zero: finish initializing first var]
-! CHECK:         br label %omp.reduction.neutral25
+!                [source length was zero: finish initializing array]
+! CHECK:         br label %omp.reduction.neutral22
 
 ! CHECK:       omp.private.copy17:                               ; preds = %omp.private.copy16
 !                [source length was non-zero: call assign runtime]
@@ -266,5 +222,5 @@ subroutine worst_case(a, b, c, d)
 !                [var extent was non-zero: malloc a private array]
 ! CHECK:         br label %omp.private.init5
 
-! CHECK:       omp.par.exit.exitStub:                           ; preds = %omp.region.cont57
+! CHECK:       omp.par.exit.exitStub:                           ; preds = %omp.region.cont51
 ! CHECK-NEXT:    ret void

diff  --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90
index 4b27e6ddc79a4..ed11a95c4aeb1 100644
--- a/flang/test/Integration/OpenMP/private-global.f90
+++ b/flang/test/Integration/OpenMP/private-global.f90
@@ -17,21 +17,34 @@ program bug
 
 ! CHECK-LABEL: define internal void {{.*}}..omp_par(
 ! CHECK:       omp.par.entry:
+! CHECK:         %[[VAL_9:.*]] = alloca i32, align 4
+! CHECK:         %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4
+! CHECK:         store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4
+! CHECK:         %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4
 ! CHECK:         %[[PRIV_BOX_ALLOC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[ELEMENTAL_TMP_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+! CHECK:         %[[FIFTY:.*]] = alloca i32, i64 1, align 4
+! CHECK:         %[[INTERMEDIATE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8
+! CHECK:         %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
 ! ...
-! check that the private copy is allocated via malloc
-! CHECK:       omp.private.init:
-! CHECK:         %[[PRIV_TABLE:.*]] = call ptr @malloc(i64 40)
-! ...
-! check that we use the private copy of table for the assignment (table = 50)
-! The assignment is now inlined as a loop instead of calling _FortranAAssign.
+! check that we use the private copy of table for the assignment
 ! CHECK:       omp.par.region1:
-! CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr{{.*}}%[[BOX_COPY:.*]], ptr{{.*}}%[[PRIV_BOX_ALLOC]], i32 48, i1 false)
-! ...
-! check that we use the private copy of table for table/=50 (inlined loop body)
-! CHECK:       omp.par.region6:
-! CHECK:         %[[VAL_44:.*]] = sub {{.*}} i64 %{{.*}}, 1
+! CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr{{.*}}%[[INTERMEDIATE]], ptr{{.*}}%[[PRIV_BOX_ALLOC]], i32 {{4[48]}}, i1 false)
+! CHECK:         store i32 50, ptr %[[FIFTY]], align 4
+! CHECK:         %[[FIFTY_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 4, i32 20240719, i8 0, i8 9, i8 0, i8 0 }, ptr %[[FIFTY]], 0
+! CHECK:         store { ptr, i64, i32, i8, i8, i8, i8 } %[[FIFTY_BOX_VAL]], ptr %[[BOXED_FIFTY]], align {{[48]}}
+! CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[INTERMEDIATE]], i32 {{4[48]}}, i1 false)
+! CHECK:         call void @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9)
+! CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr{{.*}}%[[TABLE_BOX_ADDR]], ptr{{.*}}%[[PRIV_BOX_ALLOC]], i32 {{4[48]}}, i1 false)
+! CHECK:         %[[PRIV_TABLE:.*]] = call ptr @malloc(i{{(32)|(64)}} 40)
 ! ...
-! check that we store 50 into the private table's elements (inlined loop body)
+! check that we use the private copy of table for table/=50
 ! CHECK:       omp.par.region3:
-! CHECK:         store i32 50, ptr %{{.*}}, align 4
+! CHECK:         %[[VAL_44:.*]] = sub nuw nsw i64 %{{.*}}, 1
+! CHECK:         %[[VAL_45:.*]] = mul nuw nsw i64 %[[VAL_44]], 1
+! CHECK:         %[[VAL_46:.*]] = mul nuw nsw i64 %[[VAL_45]], 1
+! CHECK:         %[[VAL_47:.*]] = add nuw nsw i64 %[[VAL_46]], 0
+! CHECK:         %[[VAL_48:.*]] = getelementptr nusw nuw i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]]

diff  --git a/flang/test/Integration/prefetch.f90 b/flang/test/Integration/prefetch.f90
index 76227caf02b43..c015b6736972a 100644
--- a/flang/test/Integration/prefetch.f90
+++ b/flang/test/Integration/prefetch.f90
@@ -13,6 +13,7 @@
 !===============================================================================
 
 subroutine test_prefetch_01()
+    ! LLVM: {{.*}} = alloca i32, i64 1, align 4
     ! LLVM: %[[VAR_J:.*]] = alloca i32, i64 1, align 4
     ! LLVM: %[[VAR_I:.*]] = alloca i32, i64 1, align 4
     ! LLVM: %[[VAR_A:.*]] = alloca [256 x i32], i64 1, align 4

diff  --git a/flang/test/Lower/OpenMP/scalar-to-array-assign-target-device-O0.f90 b/flang/test/Lower/OpenMP/scalar-to-array-assign-target-device-O0.f90
new file mode 100644
index 0000000000000..db019a6a15ab1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/scalar-to-array-assign-target-device-O0.f90
@@ -0,0 +1,18 @@
+! Regression test for PR llvm/llvm-project#197092 and its follow-up.
+!
+! When compiling for an OpenMP target device at -O0, a scalar-to-array
+! broadcast assignment inside a target region must still be inlined to
+! avoid emitting a _FortranAAssign runtime call (which internally uses
+! malloc/free) into GPU device code.
+
+! RUN: %flang_fc1 -emit-fir -O0 -fopenmp -fopenmp-is-target-device %s -o - \
+! RUN:   | FileCheck %s --implicit-check-not="fir.call @_FortranAAssign"
+
+subroutine device_scalar_broadcast()
+  integer :: arr(4)
+  !$omp target map(tofrom: arr)
+  ! CHECK: omp.target
+  ! CHECK: fir.do_loop
+  arr = 11
+  !$omp end target
+end subroutine

diff  --git a/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90
index cbb4dfc3cdadc..fd02e9c234180 100644
--- a/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90
+++ b/flang/test/Lower/OpenMP/workdistribute-saxpy-and-scalar-assign.f90
@@ -1,4 +1,6 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -O1 -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -O0 -fopenmp -fopenmp-version=60 %s -o - \
+! RUN:   | FileCheck %s --check-prefix=CHECK-O0
 
 ! CHECK-LABEL: func @_QPtarget_teams_workdistribute
 subroutine target_teams_workdistribute()
@@ -57,3 +59,21 @@ subroutine teams_workdistribute()
 
   !$omp end teams workdistribute
 end subroutine teams_workdistribute
+
+! At -O0 host (no -fopenmp-is-target-device):
+!   - target teams workdistribute inlines both the saxpy and the scalar
+!     broadcast via workdistributeRuntimeCallLower, so no _FortranAAssign
+!     runtime call appears.
+!   - teams workdistribute (no target) keeps the saxpy result write-back
+!     and the scalar broadcast as _FortranAAssign runtime calls. The
+!     two calls correspond to source lines 50 (saxpy) and 58 (scalar
+!     broadcast) respectively; the ordered CHECK-O0 lines below match
+!     each region in source order so neither expectation can accidentally
+!     match the other assignment.
+! CHECK-O0-LABEL: func @_QPtarget_teams_workdistribute
+! CHECK-O0-NOT: fir.call @_FortranAAssign
+
+! CHECK-O0-LABEL: func @_QPteams_workdistribute
+! CHECK-O0: omp.wsloop
+! CHECK-O0: fir.call @_FortranAAssign({{.*}}%c50_i32)
+! CHECK-O0: fir.call @_FortranAAssign({{.*}}%c58_i32)

diff  --git a/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90 b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90
index 217df8fb05176..68c9060ee9bd6 100644
--- a/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90
+++ b/flang/test/Lower/OpenMP/workdistribute-scalar-assign.f90
@@ -1,4 +1,7 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s --implicit-check-not="fir.call @_FortranAAssign"
+! RUN: %flang_fc1 -emit-fir -O1 -fopenmp -fopenmp-version=60 %s -o - \
+! RUN:   | FileCheck %s --implicit-check-not="fir.call @_FortranAAssign"
+! RUN: %flang_fc1 -emit-fir -O0 -fopenmp -fopenmp-version=60 %s -o - \
+! RUN:   | FileCheck %s --check-prefix=CHECK-O0
 
 ! CHECK-LABEL: func @_QPtarget_teams_workdistribute_scalar_assign
 subroutine target_teams_workdistribute_scalar_assign()
@@ -31,3 +34,17 @@ subroutine teams_workdistribute_scalar_assign()
   !$omp end teams workdistribute
 
 end subroutine teams_workdistribute_scalar_assign
+
+! At -O0 host (no -fopenmp-is-target-device), target teams workdistribute
+! still goes through workdistributeRuntimeCallLower, so the scalar
+! broadcast inside the target region is inlined and workshared, with no
+! _FortranAAssign runtime call.
+! CHECK-O0-LABEL: func @_QPtarget_teams_workdistribute_scalar_assign
+! CHECK-O0: omp.wsloop
+! CHECK-O0-NOT: fir.call @_FortranAAssign
+
+! At -O0 host, teams workdistribute (no target) does not inline the
+! scalar broadcast after PR #201774, so the assignment remains a
+! _FortranAAssign runtime call. This pins the plain host -O0 behavior.
+! CHECK-O0-LABEL: func @_QPteams_workdistribute_scalar_assign
+! CHECK-O0: fir.call @_FortranAAssign

diff  --git a/flang/test/Lower/scalar-to-array-assign-host-O0.f90 b/flang/test/Lower/scalar-to-array-assign-host-O0.f90
new file mode 100644
index 0000000000000..88d4344da6f2b
--- /dev/null
+++ b/flang/test/Lower/scalar-to-array-assign-host-O0.f90
@@ -0,0 +1,17 @@
+! Regression test for the follow-up to PR llvm/llvm-project#197092.
+!
+! At -O0 on the host (no OpenMP target-device compilation), a scalar-to-array
+! broadcast assignment must lower to a Fortran runtime call
+! (_FortranAAssign), not to an inline assignment loop. Lowering it inline
+! at -O0 caused -g line breakpoints to hit once per array element instead
+! of once.
+
+! RUN: %flang_fc1 -emit-fir -O0 %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPhost_scalar_broadcast
+subroutine host_scalar_broadcast(arr)
+  integer :: arr(4)
+  ! CHECK: fir.call @_FortranAAssign
+  ! CHECK-NOT: fir.do_loop
+  arr = 11
+end subroutine

diff  --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 30b4a99c8f2d5..23e7af238198f 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -576,6 +576,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
     config.SkipConvertComplexPow = targetMachine.getTargetTriple().isAMDGCN();
     if (enableOpenMP)
       config.EnableOpenMP = true;
+    if (enableOpenMPDevice)
+      config.EnableOpenMPIsTargetDevice = true;
     config.NSWOnLoopVarInc = !integerWrapAround;
     fir::registerDefaultInlinerPass(config);
     fir::createDefaultFIROptimizerPassPipeline(pm, config);