[llvm] [DAGISel] Keep flags when converting FP load/store to integer (PR #111679)

Wed Oct 9 06:48:18 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-selectiondag

Author: Oliver Stannard (ostannard)

<details>
<summary>Changes</summary>

This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.

---
Full diff: https://github.com/llvm/llvm-project/pull/111679.diff


4 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+4-6) 
- (modified) llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll (+8-8) 
- (added) llvm/test/CodeGen/ARM/load-store-pair-volatile.ll (+24) 
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll (+5-5) 


``````````diff

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 348db9ca7c430e..810ca458bc8787 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20322,13 +20322,11 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
         !FastLD || !FastST)
       return SDValue();
 
-    SDValue NewLD =
-        DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
-                    LD->getPointerInfo(), LD->getAlign());
+    SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(),
+                                LD->getBasePtr(), LD->getMemOperand());
 
-    SDValue NewST =
-        DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
-                     ST->getPointerInfo(), ST->getAlign());
+    SDValue NewST = DAG.getStore(ST->getChain(), SDLoc(N), NewLD,
+                                 ST->getBasePtr(), ST->getMemOperand());
 
     AddToWorklist(NewLD.getNode());
     AddToWorklist(NewST.getNode());
diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
index 782894976c711c..b5f0b2ff9ef4cf 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
@@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -36,7 +36,7 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -63,7 +63,7 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -90,7 +90,7 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -117,7 +117,7 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -144,7 +144,7 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -172,7 +172,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x84
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -184,7 +184,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x84
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
new file mode 100644
index 00000000000000..6278672d9e2397
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=arm-none-eabi -stop-after=finalize-isel < %s | FileCheck %s
+
+define void @test(ptr %vol_one, ptr %p_in, ptr %p_out, i32 %n) {
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $r2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $r1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $r0
+  ; CHECK-NEXT:   [[LDRi12_:%[0-9]+]]:gpr = LDRi12 [[COPY1]], 0, 14 /* CC::al */, $noreg :: (load (s32) from %ir.p_in)
+  ; CHECK-NEXT:   STRi12 killed [[LDRi12_]], [[COPY2]], 0, 14 /* CC::al */, $noreg :: (volatile store (s32) into %ir.vol_one)
+  ; CHECK-NEXT:   [[LDRi12_1:%[0-9]+]]:gpr = LDRi12 [[COPY2]], 4, 14 /* CC::al */, $noreg :: (volatile load (s32) from %ir.vol_two)
+  ; CHECK-NEXT:   STRi12 killed [[LDRi12_1]], [[COPY]], 0, 14 /* CC::al */, $noreg :: (store (s32) into %ir.p_out)
+  ; CHECK-NEXT:   MOVPCLR 14 /* CC::al */, $noreg
+entry:
+  %vol_two = getelementptr inbounds i8, ptr %vol_one, i32 4
+  %a = load float, ptr %p_in, align 4
+  store volatile float %a, ptr %vol_one, align 4
+  %b = load volatile float, ptr %vol_two, align 4
+  store float %b, ptr %p_out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
index ccc36530c7957b..150fa91524ab0c 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
@@ -1442,8 +1442,8 @@ define void @caller_fpr_stack() {
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15)
   ; 32BIT-NEXT:   renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (load (s32) from @f14)
-  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (load (s32) from @f16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @f16)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   renamable $r5 = LI 0
   ; 32BIT-NEXT:   renamable $r6 = LIS 16352
@@ -1532,9 +1532,9 @@ define void @caller_fpr_stack() {
   ; 64BIT-NEXT:   renamable $x3 = LDtoc @f14, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $x4 = LDtoc @d15, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $x5 = LDtoc @f16, $x2 :: (load (s64) from got)
-  ; 64BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $x3 :: (load (s32) from @f14)
-  ; 64BIT-NEXT:   renamable $x4 = LD 0, killed renamable $x4 :: (load (s64) from @d15)
-  ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (load (s32) from @f16)
+  ; 64BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $x3 :: (dereferenceable load (s32) from @f14)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d15)
+  ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (dereferenceable load (s32) from @f16)
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   STW killed renamable $r5, 168, $x1 :: (store (s32))

``````````

</details>


https://github.com/llvm/llvm-project/pull/111679