[llvm] [AArch64] Treat LD1 as cheap in getVectorInstrCostHelper on Apple cores. (PR #175970)

Wed Jan 14 07:52:23 PST 2026

https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/175970

On Apple Silicon cores, single-element LD1 should have the same performance characteristics as regular loads. Consider the insert for insertelement (load) as free.

In terms of additional code-gen changes, this enables unrolling in more cases, due to the dynamic cost of vector loops with scalar loads + insert being estimated more accurately.

>From 8087f8d5a58a0714b07bf0da310f61fcde655e0a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 Jan 2026 14:39:42 +0000
Subject: [PATCH] [AArch64] Treat LD1 as cheap in getVectorInstrCostHelper on
 Apple cores.

On Apple Silicon cores, single-element LD1 should have the same
performance characteristics as regular loads. Consider the insert
for insertelement (load) as free.

In terms of additional code-gen changes, this enables unrolling in more
cases, due to the dynamic cost of vector loops with scalar loads +
insert being estimated more accurately.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  7 +-
 .../CostModel/AArch64/insert-extract.ll       | 65 ++++++++++++-------
 2 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5e890f6067e82..af97500bf6c00 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4024,11 +4024,14 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     // This is recognising a LD1 single-element structure to one lane of one
     // register instruction. I.e., if this is an `insertelement` instruction,
     // and its second operand is a load, then we will generate a LD1, which
-    // are expensive instructions.
-    if (I && dyn_cast<LoadInst>(I->getOperand(1)))
+    // are expensive instructions on some uArchs.
+    if (I && dyn_cast<LoadInst>(I->getOperand(1))) {
+      if (ST->isAppleMLike())
+        return 0;
       return CostKind == TTI::TCK_CodeSize
                  ? 0
                  : ST->getVectorInsertExtractBaseCost() + 1;
+    }
 
     // i1 inserts and extract will include an extra cset or cmp of the vector
     // value. Increase the cost by 1 to account.
diff --git a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
index cef6cf1a081e6..d932cf4082fe3 100644
--- a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll
@@ -1,10 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-n1 | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-n2 | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-v1 | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-v2 | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=kryo | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-n1 | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-n2 | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-v1 | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=neoverse-v2 | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=kryo | FileCheck --check-prefixes=CHECK,GENERIC %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mcpu=apple-m4 | FileCheck --check-prefixes=CHECK,APPLE %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -84,10 +85,15 @@ define void @vectorInstrCost() {
 ;; LD1: Load one single-element structure to one lane of one register.
 
 define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) {
-; CHECK-LABEL: 'LD1_B'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %i, align 1
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %v2
+; GENERIC-LABEL: 'LD1_B'
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %i, align 1
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %v2
+;
+; APPLE-LABEL: 'LD1_B'
+; APPLE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %i, align 1
+; APPLE-NEXT:  Cost Model: Found costs of 0 for: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1
+; APPLE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %v2
 ;
 entry:
   %v1 = load i8, ptr %i, align 1
@@ -96,10 +102,15 @@ entry:
 }
 
 define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) {
-; CHECK-LABEL: 'LD1_H'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %i, align 2
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %v2
+; GENERIC-LABEL: 'LD1_H'
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %i, align 2
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %v2
+;
+; APPLE-LABEL: 'LD1_H'
+; APPLE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %i, align 2
+; APPLE-NEXT:  Cost Model: Found costs of 0 for: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2
+; APPLE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %v2
 ;
 entry:
   %v1 = load i16, ptr %i, align 2
@@ -108,10 +119,15 @@ entry:
 }
 
 define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) {
-; CHECK-LABEL: 'LD1_W'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %i, align 4
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %v2
+; GENERIC-LABEL: 'LD1_W'
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %i, align 4
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %v2
+;
+; APPLE-LABEL: 'LD1_W'
+; APPLE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %i, align 4
+; APPLE-NEXT:  Cost Model: Found costs of 0 for: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3
+; APPLE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %v2
 ;
 entry:
   %v1 = load i32, ptr %i, align 4
@@ -120,10 +136,15 @@ entry:
 }
 
 define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) {
-; CHECK-LABEL: 'LD1_X'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i64, ptr %i, align 8
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %v2
+; GENERIC-LABEL: 'LD1_X'
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i64, ptr %i, align 8
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:0 Lat:3 SizeLat:3 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
+; GENERIC-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %v2
+;
+; APPLE-LABEL: 'LD1_X'
+; APPLE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i64, ptr %i, align 8
+; APPLE-NEXT:  Cost Model: Found costs of 0 for: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0
+; APPLE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %v2
 ;
 entry:
   %v1 = load i64, ptr %i, align 8