[llvm] [LOH] Emit hints for LDP/STP instructions (PR #141297)

Ellis Hoag via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 3 11:16:16 PDT 2025


https://github.com/ellishg updated https://github.com/llvm/llvm-project/pull/141297

>From d04840cdc994a17961a6f9d7d76d8476f029d3d0 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellishoag at meta.com>
Date: Fri, 23 May 2025 14:51:50 -0700
Subject: [PATCH 1/2] [LOH] Emit hits for LDP/STP instructions

---
 llvm/lib/Target/AArch64/AArch64CollectLOH.cpp |  58 ++++++-
 .../CodeGen/AArch64/arm64-collect-loh-str.ll  |  32 ++--
 .../test/CodeGen/AArch64/arm64-collect-loh.ll | 153 ++++++++++++++++++
 3 files changed, 230 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 64f21c4cb2297..ba0dc843d942d 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -192,6 +192,7 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
   switch (MI.getOpcode()) {
   default:
     return false;
+  // STR
   case AArch64::STRBBui:
   case AArch64::STRHHui:
   case AArch64::STRBui:
@@ -201,12 +202,37 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
   case AArch64::STRSui:
   case AArch64::STRDui:
   case AArch64::STRQui:
+  // STUR
+  case AArch64::STURBi:
+  case AArch64::STURBBi:
+  case AArch64::STURHi:
+  case AArch64::STURHHi:
+  case AArch64::STURWi:
+  case AArch64::STURXi:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
     // We can only optimize the index operand.
     // In case we have str xA, [xA, #imm], this is two different uses
     // of xA and we cannot fold, otherwise the xA stored may be wrong,
     // even if #imm == 0.
     return MO.getOperandNo() == 1 &&
            MI.getOperand(0).getReg() != MI.getOperand(1).getReg();
+  // STP
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  // STNP
+  case AArch64::STNPWi:
+  case AArch64::STNPXi:
+  case AArch64::STNPSi:
+  case AArch64::STNPDi:
+  case AArch64::STNPQi:
+    return MO.getOperandNo() == 2 &&
+           MI.getOperand(0).getReg() != MI.getOperand(2).getReg() &&
+           MI.getOperand(1).getReg() != MI.getOperand(2).getReg();
   }
 }
 
@@ -216,6 +242,7 @@ static bool isCandidateLoad(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return false;
+  // LDR
   case AArch64::LDRSBWui:
   case AArch64::LDRSBXui:
   case AArch64::LDRSHWui:
@@ -228,11 +255,40 @@ static bool isCandidateLoad(const MachineInstr &MI) {
   case AArch64::LDRSui:
   case AArch64::LDRDui:
   case AArch64::LDRQui:
+  // LDUR
+  case AArch64::LDURBBi:
+  case AArch64::LDURBi:
+  case AArch64::LDURDi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURHi:
+  case AArch64::LDURQi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
     return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+  // LDP
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  // LDNP
+  case AArch64::LDNPSi:
+  case AArch64::LDNPDi:
+  case AArch64::LDNPQi:
+  case AArch64::LDNPWi:
+  case AArch64::LDNPXi:
+    return !(MI.getOperand(3).getTargetFlags() & AArch64II::MO_GOT);
   }
 }
 
-/// Check whether the given instruction can load a litteral.
+/// Check whether the given instruction can load a literal.
 static bool supportLoadFromLiteral(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index acc0df12a94e8..f8b469efe5afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s
-; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
+; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
 ; Test case for <rdar://problem/15942912>.
 ; AdrpAddStr cannot be used when the store uses same
 ; register as address and value. Indeed, the related
@@ -7,18 +7,26 @@
 ; at least provide a wrong one (with the offset folded
 ; into the definition).
 
-%struct.anon = type { ptr, ptr }
+ at A = internal global i32 0, align 4
 
- at pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
-
-; CHECK-LABEL: _pptp_wan_init
-; CHECK: ret
-; CHECK-NOT: AdrpAddStr
-define i32 @pptp_wan_init() {
+define void @str() {
 entry:
-  store ptr null, ptr @pptp_wan_head, align 8
-  store ptr @pptp_wan_head, ptr getelementptr inbounds (%struct.anon, ptr @pptp_wan_head, i64 0, i32 1), align 8
-  ret i32 0
+  store ptr @A, ptr @A, align 4
+  ret void
 }
 
+define void @stp0(i64 %t) {
+entry:
+  %addr = getelementptr inbounds i64, ptr @A, i32 1
+  store ptr @A, ptr @A, align 4
+  store i64 %t, ptr %addr, align 4
+  ret void
+}
 
+define void @stp1(i64 %t) {
+entry:
+  %addr = getelementptr inbounds i64, ptr @A, i32 1
+  store i64 %t, ptr @A, align 4
+  store ptr @A, ptr %addr, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 7f2bebf584d8f..6ac899fb41896 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -71,6 +71,34 @@ define i32 @getC() {
   ret i32 %res
 }
 
+; CHECK-LABEL: _getCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCPair() {
+  %res = load <8 x i32>, ptr @C, align 4
+  ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCNontemporalPair() {
+  %res = load <8 x i32>, ptr @C, align 4, !nontemporal !0
+  ret <8 x i32> %res
+}
+
 ; LDRSW supports loading from a literal.
 ; Make sure we emit AdrpLdrGotLdr for those.
 ; CHECK-LABEL: _getSExtC
@@ -126,6 +154,36 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: _setCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCPair(<8 x i32> %t) {
+entry:
+  store <8 x i32> %t, ptr @C, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C at GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C at GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCNontemporalPair(<8 x i32> %t) {
+entry:
+  store <8 x i32> %t, ptr @C, align 4, !nontemporal !0
+  ret void
+}
+
 ; Perform the same tests for internal global and a displacement
 ; in the addressing mode.
 ; Indeed we will get an ADD for those instead of LOADGot.
@@ -148,6 +206,51 @@ define i32 @getInternalCPlus4() {
   ret i32 %res
 }
 
+; CHECK-LABEL: _getInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalCUnscaled() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+  %res = load i32, ptr %addr, align 4
+  ret i32 %res
+}
+
+; CHECK-LABEL: _getInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCPair() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  %res = load <8 x i32>, ptr %addr, align 4
+  ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCNontemporalPair() {
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  %res = load <8 x i32>, ptr %addr, align 4, !nontemporal !0
+  ret <8 x i32> %res
+}
+
 ; LDRSW supports loading from a literal.
 ; Make sure we emit AdrpLdrGotLdr for those.
 ; CHECK-LABEL: _getSExtInternalCPlus4
@@ -206,6 +309,54 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: _setInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCUnscaled(i32 %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+  store i32 %t, ptr %addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCPair(<8 x i32> %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  store <8 x i32> %t, ptr %addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: _setInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC at PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC at PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @_setInternalCNontemporalPair(<8 x i32> %t) {
+entry:
+  %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+  store <8 x i32> %t, ptr %addr, align 4, !nontemporal !0
+  ret void
+}
+
 ; Check that we catch AdrpAddLdr case when we have a simple chain:
 ; adrp -> ldr.
 ; CHECK-LABEL: _getInternalC
@@ -679,4 +830,6 @@ if.end.i:
 }
 declare void @callee(ptr nocapture readonly, ...)
 
+!0 = !{ i32 1 }
+
 attributes #0 = { "target-cpu"="cyclone" }

>From a342803d21ecace08113e0f644d738fa20800a3f Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellishoag at meta.com>
Date: Tue, 3 Jun 2025 11:15:54 -0700
Subject: [PATCH 2/2] add comment

---
 llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index ba0dc843d942d..5232116ad7090 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -230,6 +230,10 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
   case AArch64::STNPSi:
   case AArch64::STNPDi:
   case AArch64::STNPQi:
+    // We can only optimize the index operand.
+    // In case we have str xA, xB, [xA, #imm] or str xA, xB [xB, #imm], this is
+    // two different uses of xA or xB and we cannot fold, otherwise the xA or xB
+    // stored may be wrong, even if #imm == 0.
     return MO.getOperandNo() == 2 &&
            MI.getOperand(0).getReg() != MI.getOperand(2).getReg() &&
            MI.getOperand(1).getReg() != MI.getOperand(2).getReg();



More information about the llvm-commits mailing list