[llvm] [LOH] Emit hints for LDP/STP instructions (PR #141297)

via llvm-commits llvm-commits at lists.llvm.org
Fri May 23 15:16:09 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Ellis Hoag (ellishg)

<details>
<summary>Changes</summary>

Support more load/store instructions for `.loh` directives. Note that LLD does not support these new instructions yet, so it will skip the hints that involve them for now.
https://github.com/llvm/llvm-project/blob/1695e8b3d1080cea089baa74b2c3c7fd469c62c8/lld/MachO/Arch/ARM64.cpp#L283-L314

In a large binary, we saw `NumADDToSTR` increase from 167 to 1021 and `NumLDRToSTR` from 14 to 46. I believe this shows the potential for improvement.

---
Full diff: https://github.com/llvm/llvm-project/pull/141297.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64CollectLOH.cpp (+57-1) 
- (modified) llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll (+20-12) 
- (modified) llvm/test/CodeGen/AArch64/arm64-collect-loh.ll (+153) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index c3370cd6e946c..ee95efdfbee1e 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -193,6 +193,7 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
   switch (MI.getOpcode()) {
   default:
     return false;
+  // STR
   case AArch64::STRBBui:
   case AArch64::STRHHui:
   case AArch64::STRBui:
@@ -202,12 +203,37 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
   case AArch64::STRSui:
   case AArch64::STRDui:
   case AArch64::STRQui:
+  // STUR
+  case AArch64::STURBi:
+  case AArch64::STURBBi:
+  case AArch64::STURHi:
+  case AArch64::STURHHi:
+  case AArch64::STURWi:
+  case AArch64::STURXi:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
     // We can only optimize the index operand.
     // In case we have str xA, [xA, #imm], this is two different uses
     // of xA and we cannot fold, otherwise the xA stored may be wrong,
     // even if #imm == 0.
     return MO.getOperandNo() == 1 &&
            MI.getOperand(0).getReg() != MI.getOperand(1).getReg();
+  // STP
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  // STNP
+  case AArch64::STNPWi:
+  case AArch64::STNPXi:
+  case AArch64::STNPSi:
+  case AArch64::STNPDi:
+  case AArch64::STNPQi:
+    return MO.getOperandNo() == 2 &&
+           MI.getOperand(0).getReg() != MI.getOperand(2).getReg() &&
+           MI.getOperand(1).getReg() != MI.getOperand(2).getReg();
   }
 }
 
@@ -217,6 +243,7 @@ static bool isCandidateLoad(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return false;
+  // LDR
   case AArch64::LDRSBWui:
   case AArch64::LDRSBXui:
   case AArch64::LDRSHWui:
@@ -229,11 +256,40 @@ static bool isCandidateLoad(const MachineInstr &MI) {
   case AArch64::LDRSui:
   case AArch64::LDRDui:
   case AArch64::LDRQui:
+  // LDUR
+  case AArch64::LDURBBi:
+  case AArch64::LDURBi:
+  case AArch64::LDURDi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURHi:
+  case AArch64::LDURQi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
     return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+  // LDP
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  // LDNP
+  case AArch64::LDNPSi:
+  case AArch64::LDNPDi:
+  case AArch64::LDNPQi:
+  case AArch64::LDNPWi:
+  case AArch64::LDNPXi:
+    return !(MI.getOperand(3).getTargetFlags() & AArch64II::MO_GOT);
   }
 }
 
-/// Check whether the given instruction can load a litteral.
+/// Check whether the given instruction can load a literal.
 static bool supportLoadFromLiteral(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index acc0df12a94e8..f8b469efe5afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s
-; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
+; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
 ; Test case for <rdar://problem/15942912>.
 ; AdrpAddStr cannot be used when the store uses same
 ; register as address and value. Indeed, the related
@@ -7,18 +7,26 @@
 ; at least provide a wrong one (with the offset folded
 ; into the definition).
 
-%struct.anon = type { ptr, ptr }
+@A = internal global i32 0, align 4
 
-@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
-
-; CHECK-LABEL: _pptp_wan_init
-; CHECK: ret
-; CHECK-NOT: AdrpAddStr
-define i32 @pptp_wan_init() {
+define void @str() {
 entry:
-  store ptr null, ptr @pptp_wan_head, align 8
-  store ptr @pptp_wan_head, ptr getelementptr inbounds (%struct.anon, ptr @pptp_wan_head, i64 0, i32 1), align 8
-  ret i32 0
+  store ptr @A, ptr @A, align 4
+  ret void
 }
 
+define void @stp0(i64 %t) {
+entry:
+  %addr = getelementptr inbounds i64, ptr @A, i32 1
+  store ptr @A, ptr @A, align 4
+  store i64 %t, ptr %addr, align 4
+  ret void
+}
 
+define void @stp1(i64 %t) {
+entry:
+  %addr = getelementptr inbounds i64, ptr @A, i32 1
+  store i64 %t, ptr @A, align 4
+  store ptr @A, ptr %addr, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 7f2bebf584d8f..6ac899fb41896 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -71,6 +71,34 @@ define i32 @getC() {
   ret i32 %res
 }
 
+; CHECK-LABEL: _getCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCPair() {
+  %res = load <8 x i32>, ptr @C, align 4
+  ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCNontemporalPair() {
+  %res = load <8 x i32>, ptr @C, align 4, !nontemporal !0
+  ret <8 x i32> %res
+}
+
 ; LDRSW supports loading from a literal.
 ; Make sure we emit AdrpLdrGotLdr for those.
 ; CHECK-LABEL: _getSExtC
@@ -126,6 +154,36 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: _setCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCPair(<8 x i32> %t) {
+entry:
+  store <8 x i32> %t, ptr @C, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCNontemporalPair(<8 x i32> %t) {
+entry:
+  store <8 x i32> %t, ptr @C, align 4, !nontemporal !0
+  ret void
+}
+
 ; Perform the same tests for internal global and a displacement
 ; in the addressing mode.
 ; Indeed we will get an ADD for those instead of LOADGot.
@@ -148,6 +206,51 @@ define i32 @getInternalCPlus4() {
   ret i32 %res
 }
 
+; CHECK-LABEL: _getInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalCUnscaled() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+  %res = load i32, ptr %addr, align 4
+  ret i32 %res
+}
+
+; CHECK-LABEL: _getInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCPair() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  %res = load <8 x i32>, ptr %addr, align 4
+  ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCNontemporalPair() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  %res = load <8 x i32>, ptr %addr, align 4, !nontemporal !0
+  ret <8 x i32> %res
+}
+
 ; LDRSW supports loading from a literal.
 ; Make sure we emit AdrpLdrGotLdr for those.
 ; CHECK-LABEL: _getSExtInternalCPlus4
@@ -206,6 +309,54 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: _setInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCUnscaled(i32 %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+  store i32 %t, ptr %addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCPair(<8 x i32> %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  store <8 x i32> %t, ptr %addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @_setInternalCNontemporalPair(<8 x i32> %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  store <8 x i32> %t, ptr %addr, align 4, !nontemporal !0
+  ret void
+}
+
 ; Check that we catch AdrpAddLdr case when we have a simple chain:
 ; adrp -> ldr.
 ; CHECK-LABEL: _getInternalC
@@ -679,4 +830,6 @@ if.end.i:
 }
 declare void @callee(ptr nocapture readonly, ...)
 
+!0 = !{ i32 1 }
+
 attributes #0 = { "target-cpu"="cyclone" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/141297


More information about the llvm-commits mailing list