[llvm] [LOH] Emit hints for LDP/STP instructions (PR #141297)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 23 15:16:09 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Ellis Hoag (ellishg)
<details>
<summary>Changes</summary>
Support more load/store instructions for `.loh` directives. Note that LLD does not yet support linker optimization hints on these new instructions, so it will skip those hints for now.
https://github.com/llvm/llvm-project/blob/1695e8b3d1080cea089baa74b2c3c7fd469c62c8/lld/MachO/Arch/ARM64.cpp#L283-L314
In a large binary, we saw `NumADDToSTR` increase from 167 to 1021 and `NumLDRToSTR` from 14 to 46. I believe this shows the potential for improvement.
---
Full diff: https://github.com/llvm/llvm-project/pull/141297.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64CollectLOH.cpp (+57-1)
- (modified) llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll (+20-12)
- (modified) llvm/test/CodeGen/AArch64/arm64-collect-loh.ll (+153)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index c3370cd6e946c..ee95efdfbee1e 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -193,6 +193,7 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
switch (MI.getOpcode()) {
default:
return false;
+ // STR
case AArch64::STRBBui:
case AArch64::STRHHui:
case AArch64::STRBui:
@@ -202,12 +203,37 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
+ // STUR
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ case AArch64::STURHi:
+ case AArch64::STURHHi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
// We can only optimize the index operand.
// In case we have str xA, [xA, #imm], this is two different uses
// of xA and we cannot fold, otherwise the xA stored may be wrong,
// even if #imm == 0.
return MO.getOperandNo() == 1 &&
MI.getOperand(0).getReg() != MI.getOperand(1).getReg();
+ // STP
+ case AArch64::STPWi:
+ case AArch64::STPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ // STNP
+ case AArch64::STNPWi:
+ case AArch64::STNPXi:
+ case AArch64::STNPSi:
+ case AArch64::STNPDi:
+ case AArch64::STNPQi:
+ return MO.getOperandNo() == 2 &&
+ MI.getOperand(0).getReg() != MI.getOperand(2).getReg() &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg();
}
}
@@ -217,6 +243,7 @@ static bool isCandidateLoad(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
+ // LDR
case AArch64::LDRSBWui:
case AArch64::LDRSBXui:
case AArch64::LDRSHWui:
@@ -229,11 +256,40 @@ static bool isCandidateLoad(const MachineInstr &MI) {
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
+ // LDUR
+ case AArch64::LDURBBi:
+ case AArch64::LDURBi:
+ case AArch64::LDURDi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURHi:
+ case AArch64::LDURQi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+ // LDP
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ // LDNP
+ case AArch64::LDNPSi:
+ case AArch64::LDNPDi:
+ case AArch64::LDNPQi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPXi:
+ return !(MI.getOperand(3).getTargetFlags() & AArch64II::MO_GOT);
}
}
-/// Check whether the given instruction can load a litteral.
+/// Check whether the given instruction can load a literal.
static bool supportLoadFromLiteral(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index acc0df12a94e8..f8b469efe5afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s
-; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
+; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
; Test case for <rdar://problem/15942912>.
; AdrpAddStr cannot be used when the store uses same
; register as address and value. Indeed, the related
@@ -7,18 +7,26 @@
; at least provide a wrong one (with the offset folded
; into the definition).
-%struct.anon = type { ptr, ptr }
+ at A = internal global i32 0, align 4
- at pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
-
-; CHECK-LABEL: _pptp_wan_init
-; CHECK: ret
-; CHECK-NOT: AdrpAddStr
-define i32 @pptp_wan_init() {
+define void @str() {
entry:
- store ptr null, ptr @pptp_wan_head, align 8
- store ptr @pptp_wan_head, ptr getelementptr inbounds (%struct.anon, ptr @pptp_wan_head, i64 0, i32 1), align 8
- ret i32 0
+ store ptr @A, ptr @A, align 4
+ ret void
}
+define void @stp0(i64 %t) {
+entry:
+ %addr = getelementptr inbounds i64, ptr @A, i32 1
+ store ptr @A, ptr @A, align 4
+ store i64 %t, ptr %addr, align 4
+ ret void
+}
+define void @stp1(i64 %t) {
+entry:
+ %addr = getelementptr inbounds i64, ptr @A, i32 1
+ store i64 %t, ptr @A, align 4
+ store ptr @A, ptr %addr, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 7f2bebf584d8f..6ac899fb41896 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -71,6 +71,34 @@ define i32 @getC() {
ret i32 %res
}
+; CHECK-LABEL: _getCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCPair() {
+ %res = load <8 x i32>, ptr @C, align 4
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCNontemporalPair() {
+ %res = load <8 x i32>, ptr @C, align 4, !nontemporal !0
+ ret <8 x i32> %res
+}
+
; LDRSW supports loading from a literal.
; Make sure we emit AdrpLdrGotLdr for those.
; CHECK-LABEL: _getSExtC
@@ -126,6 +154,36 @@ entry:
ret void
}
+; CHECK-LABEL: _setCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCPair(<8 x i32> %t) {
+entry:
+ store <8 x i32> %t, ptr @C, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCNontemporalPair(<8 x i32> %t) {
+entry:
+ store <8 x i32> %t, ptr @C, align 4, !nontemporal !0
+ ret void
+}
+
; Perform the same tests for internal global and a displacement
; in the addressing mode.
; Indeed we will get an ADD for those instead of LOADGot.
@@ -148,6 +206,51 @@ define i32 @getInternalCPlus4() {
ret i32 %res
}
+; CHECK-LABEL: _getInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalCUnscaled() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+ %res = load i32, ptr %addr, align 4
+ ret i32 %res
+}
+
+; CHECK-LABEL: _getInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCPair() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ %res = load <8 x i32>, ptr %addr, align 4
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCNontemporalPair() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ %res = load <8 x i32>, ptr %addr, align 4, !nontemporal !0
+ ret <8 x i32> %res
+}
+
; LDRSW supports loading from a literal.
; Make sure we emit AdrpLdrGotLdr for those.
; CHECK-LABEL: _getSExtInternalCPlus4
@@ -206,6 +309,54 @@ entry:
ret void
}
+; CHECK-LABEL: _setInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCUnscaled(i32 %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+ store i32 %t, ptr %addr, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCPair(<8 x i32> %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ store <8 x i32> %t, ptr %addr, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @_setInternalCNontemporalPair(<8 x i32> %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ store <8 x i32> %t, ptr %addr, align 4, !nontemporal !0
+ ret void
+}
+
; Check that we catch AdrpAddLdr case when we have a simple chain:
; adrp -> ldr.
; CHECK-LABEL: _getInternalC
@@ -679,4 +830,6 @@ if.end.i:
}
declare void @callee(ptr nocapture readonly, ...)
+!0 = !{ i32 1 }
+
attributes #0 = { "target-cpu"="cyclone" }
``````````
</details>
https://github.com/llvm/llvm-project/pull/141297
More information about the llvm-commits mailing list