[clang] [llvm] [AArch64][clang][llvm] Add ACLE `stshh` atomic store builtin (PR #181386)

Jonathan Thackray via cfe-commits cfe-commits at lists.llvm.org
Tue Mar 3 03:49:30 PST 2026


https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/181386

>From b7c9e0d18ac90beca6d08c3c5ec3cbc1d71b836c Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Feb 2026 14:42:05 +0000
Subject: [PATCH 01/18] [AArch64][clang][llvm] Add ACLE `stshh` atomic store
 builtin

Add `__arm_atomic_store_with_stshh` implementation as defined
in the ACLE. Validate that the arguments passed are correct, and
lower it to the stshh intrinsic plus an atomic store with the
allowed orderings.

Gate this on FEAT_PCDPHINT so that availability matches
hardware support for the `STSHH` instruction. Use an i64
immediate and side-effect modeling to satisfy tablegen and decoding.
---
 clang/include/clang/Basic/BuiltinsAArch64.def |   3 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   9 ++
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |   7 +
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  48 ++++++
 clang/lib/Headers/arm_acle.h                  |   6 +
 clang/lib/Sema/SemaARM.cpp                    | 140 ++++++++++++++++++
 .../CodeGen/AArch64/pcdphint-atomic-store.c   |  31 ++++
 .../test/Sema/AArch64/pcdphint-atomic-store.c |  29 ++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   2 +
 .../lib/Target/AArch64/AArch64InstrFormats.td |  12 +-
 .../Disassembler/AArch64Disassembler.cpp      |  13 ++
 11 files changed, 298 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
 create mode 100644 clang/test/Sema/AArch64/pcdphint-atomic-store.c

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index 5d7e956b73b87..5d747f4d9c4b2 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -135,6 +135,9 @@ TARGET_BUILTIN(__builtin_arm_st64b, "vv*WUiC*", "n", "ls64")
 TARGET_BUILTIN(__builtin_arm_st64bv, "WUiv*WUiC*", "n", "ls64")
 TARGET_BUILTIN(__builtin_arm_st64bv0, "WUiv*WUiC*", "n", "ls64")
 
+// Atomic store with PCDPHINT
+TARGET_BUILTIN(__builtin_arm_atomic_store_with_stshh, "v.", "t", "pcdphint")
+
 // Armv9.3-A Guarded Control Stack
 TARGET_BUILTIN(__builtin_arm_gcspopm, "WUiWUi", "n", "gcs")
 TARGET_BUILTIN(__builtin_arm_gcsss, "v*v*", "n", "gcs")
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a2d12a3a2c758..7f0de3a9701e9 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9552,6 +9552,15 @@ def err_atomic_builtin_must_be_pointer_intfltptr : Error<
 def err_atomic_builtin_pointer_size : Error<
   "address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte "
   "type (%0 invalid)">;
+def err_arm_atomic_store_with_stshh_bad_type : Error<
+  "address argument to '__arm_atomic_store_with_stshh' must be a pointer to an "
+  "8,16,32, or 64-bit integer type (%0 invalid)">;
+def err_arm_atomic_store_with_stshh_bad_value_type : Error<
+  "value argument to '__arm_atomic_store_with_stshh' must be an integer of the "
+  "same size as the pointed-to type (%0 invalid)">;
+def err_arm_atomic_store_with_stshh_bad_order : Error<
+  "memory order argument to '__arm_atomic_store_with_stshh' must be one of "
+  "__ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST">;
 def err_atomic_exclusive_builtin_pointer_size : Error<
   "address argument to load or store exclusive builtin must be a pointer to "
   // Because the range of legal sizes for load/store exclusive varies with the
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5129aa75f8f8d..b1bf673735329 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1107,6 +1107,13 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  if (builtinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  }
+
   if (builtinID == clang::AArch64::BI__builtin_arm_rndr ||
       builtinID == clang::AArch64::BI__builtin_arm_rndrrs) {
     cgm.errorNYI(expr->getSourceRange(),
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 62920044405be..e009dea600fff 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5274,6 +5274,54 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, Args);
   }
 
+  if (BuiltinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
+    const Expr *Arg0 = E->getArg(0);
+    const Expr *Arg1 = E->getArg(1);
+    const Expr *Arg2 = E->getArg(2);
+    const Expr *Arg3 = E->getArg(3);
+
+    Value *StoreAddr = EmitScalarExpr(Arg0);
+    Value *StoreValue = EmitScalarExpr(Arg1);
+
+    llvm::APSInt OrderVal = Arg2->EvaluateKnownConstInt(getContext());
+    llvm::APSInt RetVal = Arg3->EvaluateKnownConstInt(getContext());
+
+    llvm::AtomicOrdering Ordering;
+    switch (OrderVal.getZExtValue()) {
+    case 0: // __ATOMIC_RELAXED
+      Ordering = llvm::AtomicOrdering::Monotonic;
+      break;
+    case 3: // __ATOMIC_RELEASE
+      Ordering = llvm::AtomicOrdering::Release;
+      break;
+    case 5: // __ATOMIC_SEQ_CST
+      Ordering = llvm::AtomicOrdering::SequentiallyConsistent;
+      break;
+    default:
+      llvm_unreachable(
+          "unexpected memory order for __arm_atomic_store_with_stshh");
+    }
+
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
+    llvm::Value *Arg = llvm::ConstantInt::get(Int64Ty, RetVal.getZExtValue());
+    CallInst *HintCall = Builder.CreateCall(F, Arg);
+
+    QualType ValQT = Arg0->IgnoreParenImpCasts()
+                         ->getType()
+                         ->castAs<PointerType>()
+                         ->getPointeeType();
+    llvm::Type *ValTy = ConvertType(ValQT);
+
+    CharUnits ValAlign = getContext().getTypeAlignInChars(ValQT);
+    Address Addr = Address(StoreAddr, ValTy, ValAlign);
+    LValue LVal = MakeAddrLValue(Addr, ValQT);
+
+    EmitAtomicStore(RValue::get(StoreValue), LVal, Ordering,
+                    /* isVolatile= */ false,
+                    /* isInit= */ false);
+    return HintCall;
+  }
+
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
       BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
 
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 9a6b6a837fa5a..ec06072bcc4bf 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -840,6 +840,12 @@ __rndrrs(uint64_t *__p) {
 }
 #endif
 
+/* Atomic store with PCDPHINT */
+#if defined(__ARM_FEATURE_PCDPHINT)
+#define __arm_atomic_store_with_stshh(ptr, data, memory_order, ret)            \
+  __builtin_arm_atomic_store_with_stshh((ptr), (data), (memory_order), (ret))
+#endif
+
 /* 11.2 Guarded Control Stack intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 static __inline__ void * __attribute__((__always_inline__, __nodebug__))
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 33edc455366a7..64486fe04717c 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1107,6 +1107,143 @@ bool SemaARM::CheckARMBuiltinFunctionCall(const TargetInfo &TI,
   }
 }
 
+static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
+                                                 CallExpr *TheCall) {
+  Sema &SemaRef = S.SemaRef;
+  ASTContext &Context = S.getASTContext();
+  DeclRefExpr *DRE =
+      cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+  SourceLocation Loc = DRE->getBeginLoc();
+
+  // Ensure we have the proper number of arguments.
+  if (SemaRef.checkArgCount(TheCall, 4))
+    return true;
+
+  ExprResult PtrRes =
+      SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
+
+  // Bail if conversion failed.
+  if (PtrRes.isInvalid())
+    return true;
+
+  TheCall->setArg(0, PtrRes.get());
+  Expr *PointerArg = PtrRes.get();
+
+  // Check arg 0 is a pointer type, err out if not
+  const PointerType *PointerTy = PointerArg->getType()->getAs<PointerType>();
+  if (!PointerTy) {
+    SemaRef.Diag(Loc, diag::err_atomic_builtin_must_be_pointer)
+        << PointerArg->getType() << 0 << PointerArg->getSourceRange();
+    return true;
+  }
+
+  // Reject const-qualified pointee types, with an error
+  QualType ValType = PointerTy->getPointeeType();
+  if (ValType.isConstQualified()) {
+    SemaRef.Diag(Loc, diag::err_atomic_builtin_cannot_be_const)
+        << PointerArg->getType() << PointerArg->getSourceRange();
+    return true;
+  }
+
+  // Only integer element types are supported.
+  ValType = ValType.getUnqualifiedType();
+  if (!ValType->isIntegerType()) {
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_type)
+        << PointerArg->getType() << PointerArg->getSourceRange();
+    return true;
+  }
+
+  // Only 8/16/32/64-bit integers are supported.
+  unsigned Bits = Context.getTypeSize(ValType);
+  switch (Bits) {
+  case 8:
+  case 16:
+  case 32:
+  case 64:
+    break;
+  default:
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_type)
+        << PointerArg->getType() << PointerArg->getSourceRange();
+    return true;
+  }
+
+  ExprResult ValRes =
+      SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(1));
+
+  // Bail if conversion failed.
+  if (ValRes.isInvalid())
+    return true;
+
+  // Check if value is an integer type.
+  Expr *ValArg = ValRes.get();
+  if (!ValArg->getType()->isIntegerType()) {
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
+        << ValArg->getType() << ValArg->getSourceRange();
+    return true;
+  }
+
+  // Value width must match the pointee width.
+  if (Context.getTypeSize(ValArg->getType()) != Bits) {
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
+        << ValArg->getType() << ValArg->getSourceRange();
+    return true;
+  }
+
+  // Prepare a cast if the value type differs
+  ExprResult ValArgRes;
+  CastKind CK =
+      ValArg->getType().getCanonicalType() == ValType.getCanonicalType()
+          ? CK_NoOp
+          : CK_IntegralCast;
+
+  // Apply cast to the pointee type.
+  ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
+
+  // Bail if cast failed.
+  if (ValArgRes.isInvalid())
+    return true;
+
+  TheCall->setArg(1, ValArgRes.get());
+  Expr *OrderArg = TheCall->getArg(2);
+
+  // Defer validation for dependent memory_order arguments.
+  if (OrderArg->isValueDependent())
+    return false;
+
+  // Require an order value.
+  std::optional<llvm::APSInt> OrderValOpt =
+      OrderArg->getIntegerConstantExpr(Context);
+  if (!OrderValOpt) {
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
+        << OrderArg->getSourceRange();
+    return true;
+  }
+
+  // Validate order; not used here; used later in codegen.
+  llvm::APSInt OrderVal = *OrderValOpt;
+  int64_t Order = OrderVal.getSExtValue();
+  // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
+  constexpr int64_t AtomicRelaxed = 0;
+  constexpr int64_t AtomicRelease = 3;
+  constexpr int64_t AtomicSeqCst = 5;
+  switch (Order) {
+  case AtomicRelaxed:
+  case AtomicRelease:
+  case AtomicSeqCst:
+    break;
+  default:
+    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
+        << OrderArg->getSourceRange();
+    return true;
+  }
+
+  // Arg 3 (retention policy) must be between KEEP(0) and STRM(1).
+  if (SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 1))
+    return true;
+
+  return false;
+}
+
 bool SemaARM::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
                                               unsigned BuiltinID,
                                               CallExpr *TheCall) {
@@ -1117,6 +1254,9 @@ bool SemaARM::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
     return CheckARMBuiltinExclusiveCall(TI, BuiltinID, TheCall);
   }
 
+  if (BuiltinID == AArch64::BI__builtin_arm_atomic_store_with_stshh)
+    return CheckAArch64AtomicStoreWithStshhCall(*this, TheCall);
+
   if (BuiltinID == AArch64::BI__builtin_arm_prefetch) {
     return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 1) ||
            SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 3) ||
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
new file mode 100644
index 0000000000000..79510be522b6a
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +pcdphint -D__ARM_FEATURE_PCDPHINT -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_acle.h>
+
+void test_u8(unsigned char *p, unsigned char v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+}
+// CHECK-LABEL: @test_u8
+// CHECK: call void @llvm.aarch64.stshh(i64 0)
+// CHECK-NEXT: store atomic i8 %{{.*}}, ptr %{{.*}} monotonic
+
+void test_u16(unsigned short *p, unsigned short v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELEASE, 1);
+}
+// CHECK-LABEL: @test_u16
+// CHECK: call void @llvm.aarch64.stshh(i64 1)
+// CHECK-NEXT: store atomic i16 %{{.*}}, ptr %{{.*}} release
+
+void test_u32(unsigned int *p, unsigned int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_SEQ_CST, 0);
+}
+// CHECK-LABEL: @test_u32
+// CHECK: call void @llvm.aarch64.stshh(i64 0)
+// CHECK-NEXT: store atomic i32 %{{.*}}, ptr %{{.*}} seq_cst
+
+void test_u64(unsigned long long *p, unsigned long long v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 1);
+}
+// CHECK-LABEL: @test_u64
+// CHECK: call void @llvm.aarch64.stshh(i64 1)
+// CHECK-NEXT: store atomic i64 %{{.*}}, ptr %{{.*}} monotonic
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
new file mode 100644
index 0000000000000..091f1c25c2880
--- /dev/null
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +pcdphint \
+// RUN:   -D__ARM_FEATURE_PCDPHINT -fsyntax-only -verify %s
+
+#include <arm_acle.h>
+
+void test_const_pointer(const unsigned int *p, unsigned int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error at -1 {{address argument to atomic builtin cannot be const-qualified}}
+}
+
+void test_non_integer_pointer(float *p, float v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error at -1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
+}
+
+void test_invalid_bit_width(__int128 *p, __int128 v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error at -1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
+}
+
+void test_invalid_memory_order(unsigned int *p, unsigned int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_ACQUIRE, 0);
+  // expected-error at -1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
+}
+
+void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
+  // expected-error at -1 {{argument value 2 is outside the valid range [0, 1]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7f4b7383415c1..19ba3a5a740c5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -62,6 +62,8 @@ def int_aarch64_frint64x
 // HINT
 
 def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
+def int_aarch64_stshh
+    : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrHasSideEffects]>;
 
 def int_aarch64_break : Intrinsic<[], [llvm_i32_ty],
     [IntrNoMem, IntrHasSideEffects, IntrNoReturn, IntrCold, ImmArg<ArgIndex<0>>]>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 7d4e034ca16c8..69fb01ada0b40 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1840,16 +1840,24 @@ def PHintInstOperand : AsmOperandClass {
     let ParserMethod = "tryParsePHintInstOperand";
 }
 
-def phint_op : Operand<i32> {
+def phint_op : Operand<i64> {
     let ParserMatchClass = PHintInstOperand;
    let PrintMethod = "printPHintOp";
    let OperandType = "OPERAND_IMMEDIATE";
+   let MIOperandInfo = (ops i64imm);
+  let DecoderMethod = "DecodeUImm<3>";
 }
 
 class STSHHI
-    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy", []>,
+    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy",
+                    [(int_aarch64_stshh (i64 imm0_7:$policy))]>,
       Sched<[WriteHint]> {
   bits<3> policy;
+  // NOTE: ideally, this would have mayLoad = 0, mayStore = 0, but we cannot
+  // model patterns with sufficiently fine granularity.
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
   let Inst{20-12} = 0b000011001;
   let Inst{11-8} = 0b0110;
   let Inst{7-5} = policy;
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 4eb762a00d477..8fa1913ce24e5 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -38,6 +38,9 @@ using DecodeStatus = MCDisassembler::DecodeStatus;
 template <int Bits>
 static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
                                const MCDisassembler *Decoder);
+template <int Bits>
+static DecodeStatus DecodeUImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
+                               const MCDisassembler *Decoder);
 
 #define Success MCDisassembler::Success
 #define Fail MCDisassembler::Fail
@@ -1442,6 +1445,16 @@ static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
   return Success;
 }
 
+template <int Bits>
+static DecodeStatus DecodeUImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
+                               const MCDisassembler *Decoder) {
+  if (Imm & ~((1ULL << Bits) - 1))
+    return Fail;
+
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return Success;
+}
+
 // Decode 8-bit signed/unsigned immediate for a given element width.
 template <int ElementWidth>
 static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,

>From 6a1744fdfb1293543f2d266d239f9a2c7a80b92f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Feb 2026 17:19:25 +0000
Subject: [PATCH 02/18] fixup!

A few small tidyups
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp        | 13 +++++++------
 clang/test/Sema/AArch64/pcdphint-atomic-store.c |  4 ++++
 llvm/lib/Target/AArch64/AArch64InstrFormats.td  |  8 ++++----
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index e009dea600fff..e121de7738482 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5284,7 +5284,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Value *StoreValue = EmitScalarExpr(Arg1);
 
     llvm::APSInt OrderVal = Arg2->EvaluateKnownConstInt(getContext());
-    llvm::APSInt RetVal = Arg3->EvaluateKnownConstInt(getContext());
+    llvm::APSInt RetentionPolicy = Arg3->EvaluateKnownConstInt(getContext());
 
     llvm::AtomicOrdering Ordering;
     switch (OrderVal.getZExtValue()) {
@@ -5302,10 +5302,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
           "unexpected memory order for __arm_atomic_store_with_stshh");
     }
 
-    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
-    llvm::Value *Arg = llvm::ConstantInt::get(Int64Ty, RetVal.getZExtValue());
-    CallInst *HintCall = Builder.CreateCall(F, Arg);
-
     QualType ValQT = Arg0->IgnoreParenImpCasts()
                          ->getType()
                          ->castAs<PointerType>()
@@ -5316,10 +5312,15 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Address Addr = Address(StoreAddr, ValTy, ValAlign);
     LValue LVal = MakeAddrLValue(Addr, ValQT);
 
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
+    llvm::Value *Arg =
+        llvm::ConstantInt::get(Int64Ty, RetentionPolicy.getZExtValue());
+    Builder.CreateCall(F, Arg);
+
     EmitAtomicStore(RValue::get(StoreValue), LVal, Ordering,
                     /* isVolatile= */ false,
                     /* isInit= */ false);
-    return HintCall;
+    return nullptr;
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index 091f1c25c2880..d9784656d486d 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -27,3 +27,7 @@ void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
   // expected-error at -1 {{argument value 2 is outside the valid range [0, 1]}}
 }
+
+void test_signed_ok(int *p, int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 69fb01ada0b40..1390600488bf2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1842,10 +1842,10 @@ def PHintInstOperand : AsmOperandClass {
 
 def phint_op : Operand<i64> {
     let ParserMatchClass = PHintInstOperand;
-   let PrintMethod = "printPHintOp";
-   let OperandType = "OPERAND_IMMEDIATE";
-   let MIOperandInfo = (ops i64imm);
-  let DecoderMethod = "DecodeUImm<3>";
+    let PrintMethod = "printPHintOp";
+    let OperandType = "OPERAND_IMMEDIATE";
+    let MIOperandInfo = (ops i64imm);
+    let DecoderMethod = "DecodeUImm<3>";
 }
 
 class STSHHI

>From 9090921200bc3289212e585bf99af451f29cbeb9 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Feb 2026 17:26:06 +0000
Subject: [PATCH 03/18] fixup!

Tidy more small issues, and remove gating.
---
 clang/include/clang/Basic/BuiltinsAArch64.def      | 2 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp           | 4 ++--
 clang/lib/Headers/arm_acle.h                       | 2 --
 clang/lib/Sema/SemaARM.cpp                         | 2 +-
 clang/test/CodeGen/AArch64/pcdphint-atomic-store.c | 2 +-
 clang/test/Sema/AArch64/pcdphint-atomic-store.c    | 8 ++++++--
 6 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index 5d747f4d9c4b2..5722b045f1ed1 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -136,7 +136,7 @@ TARGET_BUILTIN(__builtin_arm_st64bv, "WUiv*WUiC*", "n", "ls64")
 TARGET_BUILTIN(__builtin_arm_st64bv0, "WUiv*WUiC*", "n", "ls64")
 
 // Atomic store with PCDPHINT
-TARGET_BUILTIN(__builtin_arm_atomic_store_with_stshh, "v.", "t", "pcdphint")
+TARGET_BUILTIN(__builtin_arm_atomic_store_with_stshh, "v.", "t", "")
 
 // Armv9.3-A Guarded Control Stack
 TARGET_BUILTIN(__builtin_arm_gcspopm, "WUiWUi", "n", "gcs")
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index e121de7738482..c3bd965db9b44 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5315,12 +5315,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
     llvm::Value *Arg =
         llvm::ConstantInt::get(Int64Ty, RetentionPolicy.getZExtValue());
-    Builder.CreateCall(F, Arg);
+    CallInst *HintCall = Builder.CreateCall(F, Arg);
 
     EmitAtomicStore(RValue::get(StoreValue), LVal, Ordering,
                     /* isVolatile= */ false,
                     /* isInit= */ false);
-    return nullptr;
+    return HintCall;
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index ec06072bcc4bf..88ffd82912df1 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -841,10 +841,8 @@ __rndrrs(uint64_t *__p) {
 #endif
 
 /* Atomic store with PCDPHINT */
-#if defined(__ARM_FEATURE_PCDPHINT)
 #define __arm_atomic_store_with_stshh(ptr, data, memory_order, ret)            \
   __builtin_arm_atomic_store_with_stshh((ptr), (data), (memory_order), (ret))
-#endif
 
 /* 11.2 Guarded Control Stack intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 64486fe04717c..b09b5c0cf2f66 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1219,7 +1219,7 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
   }
 
-  // Validate order; not used here; used later in codegen.
+  // Validate order here; the value is mapped to LLVM ordering in codegen.
   llvm::APSInt OrderVal = *OrderValOpt;
   int64_t Order = OrderVal.getSExtValue();
   // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
index 79510be522b6a..fceb739782641 100644
--- a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +pcdphint -D__ARM_FEATURE_PCDPHINT -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
 #include <arm_acle.h>
 
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index d9784656d486d..bd69ca859f15e 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -1,5 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +pcdphint \
-// RUN:   -D__ARM_FEATURE_PCDPHINT -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s
 
 #include <arm_acle.h>
 
@@ -31,3 +30,8 @@ void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
 void test_signed_ok(int *p, int v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
 }
+
+void test_value_size_mismatch(int *p, short v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer of the same size as the pointed-to type}}
+}

>From 4b27b1513f781bf3e3859b3f1fc301d44becd48d Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Feb 2026 21:38:30 +0000
Subject: [PATCH 04/18] fixup! Improve error diagnostics, and other cleanups

---
 clang/include/clang/Basic/DiagnosticSemaKinds.td   |  2 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp           |  3 ++-
 clang/lib/Headers/arm_acle.h                       |  2 ++
 clang/lib/Sema/SemaARM.cpp                         |  6 ++++--
 clang/test/Sema/AArch64/pcdphint-atomic-store.c    |  2 +-
 llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll | 12 ++++++++++++
 6 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 7f0de3a9701e9..46eebe301b087 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9557,7 +9557,7 @@ def err_arm_atomic_store_with_stshh_bad_type : Error<
   "8,16,32, or 64-bit integer type (%0 invalid)">;
 def err_arm_atomic_store_with_stshh_bad_value_type : Error<
   "value argument to '__arm_atomic_store_with_stshh' must be an integer of the "
-  "same size as the pointed-to type (%0 invalid)">;
+  "same size as the pointed-to type; expected %0 bits, got %1 bits">;
 def err_arm_atomic_store_with_stshh_bad_order : Error<
   "memory order argument to '__arm_atomic_store_with_stshh' must be one of "
   "__ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index c3bd965db9b44..8cc22c6885d6e 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5315,10 +5315,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
     llvm::Value *Arg =
         llvm::ConstantInt::get(Int64Ty, RetentionPolicy.getZExtValue());
+    // Execute hint before store to provide cache prefetch guidance.
     CallInst *HintCall = Builder.CreateCall(F, Arg);
 
     EmitAtomicStore(RValue::get(StoreValue), LVal, Ordering,
-                    /* isVolatile= */ false,
+                    /* isVolatile= */ LVal.isVolatile(),
                     /* isInit= */ false);
     return HintCall;
   }
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 88ffd82912df1..19a534d320790 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -841,8 +841,10 @@ __rndrrs(uint64_t *__p) {
 #endif
 
 /* Atomic store with PCDPHINT */
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 #define __arm_atomic_store_with_stshh(ptr, data, memory_order, ret)            \
   __builtin_arm_atomic_store_with_stshh((ptr), (data), (memory_order), (ret))
+#endif
 
 /* 11.2 Guarded Control Stack intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index b09b5c0cf2f66..fd135c15e5c0c 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1178,14 +1178,16 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   Expr *ValArg = ValRes.get();
   if (!ValArg->getType()->isIntegerType()) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
-        << ValArg->getType() << ValArg->getSourceRange();
+        << Bits << Context.getTypeSize(ValArg->getType())
+        << ValArg->getSourceRange();
     return true;
   }
 
   // Value width must match the pointee width.
   if (Context.getTypeSize(ValArg->getType()) != Bits) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
-        << ValArg->getType() << ValArg->getSourceRange();
+        << Bits << Context.getTypeSize(ValArg->getType())
+        << ValArg->getSourceRange();
     return true;
   }
 
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index bd69ca859f15e..96efaff847b0c 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -33,5 +33,5 @@ void test_signed_ok(int *p, int v) {
 
 void test_value_size_mismatch(int *p, short v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer of the same size as the pointed-to type}}
+  // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer of the same size as the pointed-to type; expected 32 bits, got 16 bits}}
 }
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
new file mode 100644
index 0000000000000..c424c0db6525f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64 -mattr=+pcdphint < %s | FileCheck %s
+
+declare void @llvm.aarch64.stshh(i64)
+
+define void @test_stshh_atomic_store(ptr %p, i32 %v) {
+; CHECK-LABEL: test_stshh_atomic_store
+; CHECK: stshh
+; CHECK: str
+  call void @llvm.aarch64.stshh(i64 0)
+  store atomic i32 %v, ptr %p monotonic, align 4
+  ret void
+}

>From 0c80ac99ae3c879f7a9428350a37f41a0f8bbf9f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 16 Feb 2026 09:47:22 +0000
Subject: [PATCH 05/18] fixup! Fix Kerry's CR comments and add negative test
 for "must be an integer type"

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  3 +++
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 10 +++------
 clang/lib/Headers/arm_acle.h                  |  2 +-
 clang/lib/Sema/SemaARM.cpp                    |  6 ++---
 .../test/Sema/AArch64/pcdphint-atomic-store.c |  5 +++++
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 22 ++++++++++++++-----
 6 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 46eebe301b087..f7306d3b65bd7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9558,6 +9558,9 @@ def err_arm_atomic_store_with_stshh_bad_type : Error<
 def err_arm_atomic_store_with_stshh_bad_value_type : Error<
   "value argument to '__arm_atomic_store_with_stshh' must be an integer of the "
   "same size as the pointed-to type; expected %0 bits, got %1 bits">;
+def err_arm_atomic_store_with_stshh_bad_value_must_be_integer : Error<
+  "value argument to '__arm_atomic_store_with_stshh' must be an integer type "
+  "(%0 invalid)">;
 def err_arm_atomic_store_with_stshh_bad_order : Error<
   "memory order argument to '__arm_atomic_store_with_stshh' must be one of "
   "__ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 8cc22c6885d6e..1d8ff348404a1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5276,15 +5276,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
     const Expr *Arg0 = E->getArg(0);
-    const Expr *Arg1 = E->getArg(1);
-    const Expr *Arg2 = E->getArg(2);
-    const Expr *Arg3 = E->getArg(3);
-
     Value *StoreAddr = EmitScalarExpr(Arg0);
-    Value *StoreValue = EmitScalarExpr(Arg1);
+    Value *StoreValue = EmitScalarExpr(E->getArg(1));
 
-    llvm::APSInt OrderVal = Arg2->EvaluateKnownConstInt(getContext());
-    llvm::APSInt RetentionPolicy = Arg3->EvaluateKnownConstInt(getContext());
+    llvm::APSInt OrderVal = E->getArg(2)->EvaluateKnownConstInt(getContext());
+    llvm::APSInt RetentionPolicy = E->getArg(3)->EvaluateKnownConstInt(getContext());
 
     llvm::AtomicOrdering Ordering;
     switch (OrderVal.getZExtValue()) {
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 19a534d320790..fc1af8f1d5a12 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -843,7 +843,7 @@ __rndrrs(uint64_t *__p) {
 /* Atomic store with PCDPHINT */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 #define __arm_atomic_store_with_stshh(ptr, data, memory_order, ret)            \
-  __builtin_arm_atomic_store_with_stshh((ptr), (data), (memory_order), (ret))
+  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order, ret)
 #endif
 
 /* 11.2 Guarded Control Stack intrinsics */
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index fd135c15e5c0c..ca844d647b52a 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1177,9 +1177,9 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // Check if value is an integer type.
   Expr *ValArg = ValRes.get();
   if (!ValArg->getType()->isIntegerType()) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
-        << Bits << Context.getTypeSize(ValArg->getType())
-        << ValArg->getSourceRange();
+    SemaRef.Diag(Loc,
+                 diag::err_arm_atomic_store_with_stshh_bad_value_must_be_integer)
+        << ValArg->getType() << ValArg->getSourceRange();
     return true;
   }
 
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index 96efaff847b0c..9ca2c0e8f9172 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -35,3 +35,8 @@ void test_value_size_mismatch(int *p, short v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer of the same size as the pointed-to type; expected 32 bits, got 16 bits}}
 }
+
+void test_non_integer_value(int *p, float v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer type ('float' invalid)}}
+}
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index c424c0db6525f..f6e6b1838fa5d 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -1,12 +1,22 @@
-; RUN: llc -mtriple=aarch64 -mattr=+pcdphint < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
 
 declare void @llvm.aarch64.stshh(i64)
 
-define void @test_stshh_atomic_store(ptr %p, i32 %v) {
-; CHECK-LABEL: test_stshh_atomic_store
-; CHECK: stshh
-; CHECK: str
+define void @test_keep() {
+; CHECK-LABEL: test_keep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    ret
   call void @llvm.aarch64.stshh(i64 0)
-  store atomic i32 %v, ptr %p monotonic, align 4
+  ret void
+}
+
+define void @test_strm() {
+; CHECK-LABEL: test_strm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh(i64 1)
   ret void
 }

>From 982af60bf7d8a68d8f9a9dc900ae731bd0178f74 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 16 Feb 2026 22:40:36 +0000
Subject: [PATCH 06/18] fixup! Ensure stshh always immediately precedes a store
 instruction

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 52 ++++++------
 clang/lib/Sema/SemaARM.cpp                    |  4 +-
 .../CodeGen/AArch64/pcdphint-atomic-store.c   | 63 +++++++++++---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  8 +-
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 62 ++++++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 82 +++++++++++++++++++
 .../lib/Target/AArch64/AArch64InstrFormats.td | 11 ++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 15 ++++
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  |  6 +-
 9 files changed, 251 insertions(+), 52 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 1d8ff348404a1..2bb01c9450cb8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5278,46 +5278,40 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     const Expr *Arg0 = E->getArg(0);
     Value *StoreAddr = EmitScalarExpr(Arg0);
     Value *StoreValue = EmitScalarExpr(E->getArg(1));
+    Value *Order = EmitScalarExpr(E->getArg(2));
+    Value *Policy = EmitScalarExpr(E->getArg(3));
 
-    llvm::APSInt OrderVal = E->getArg(2)->EvaluateKnownConstInt(getContext());
-    llvm::APSInt RetentionPolicy = E->getArg(3)->EvaluateKnownConstInt(getContext());
+    auto *OrderC = dyn_cast<llvm::ConstantInt>(Order);
+    auto *PolicyC = dyn_cast<llvm::ConstantInt>(Policy);
 
-    llvm::AtomicOrdering Ordering;
-    switch (OrderVal.getZExtValue()) {
+    assert(OrderC && PolicyC &&
+           "order/policy must be constant for __arm_atomic_store_with_stshh");
+
+    // Validate ordering argument; bail out if invalid
+    switch (OrderC->getZExtValue()) {
     case 0: // __ATOMIC_RELAXED
-      Ordering = llvm::AtomicOrdering::Monotonic;
-      break;
     case 3: // __ATOMIC_RELEASE
-      Ordering = llvm::AtomicOrdering::Release;
-      break;
     case 5: // __ATOMIC_SEQ_CST
-      Ordering = llvm::AtomicOrdering::SequentiallyConsistent;
       break;
     default:
       llvm_unreachable(
           "unexpected memory order for __arm_atomic_store_with_stshh");
     }
 
-    QualType ValQT = Arg0->IgnoreParenImpCasts()
-                         ->getType()
-                         ->castAs<PointerType>()
-                         ->getPointeeType();
-    llvm::Type *ValTy = ConvertType(ValQT);
-
-    CharUnits ValAlign = getContext().getTypeAlignInChars(ValQT);
-    Address Addr = Address(StoreAddr, ValTy, ValAlign);
-    LValue LVal = MakeAddrLValue(Addr, ValQT);
-
-    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh);
-    llvm::Value *Arg =
-        llvm::ConstantInt::get(Int64Ty, RetentionPolicy.getZExtValue());
-    // Execute hint before store to provide cache prefetch guidance.
-    CallInst *HintCall = Builder.CreateCall(F, Arg);
-
-    EmitAtomicStore(RValue::get(StoreValue), LVal, Ordering,
-                    /* isVolatile= */ LVal.isVolatile(),
-                    /* isInit= */ false);
-    return HintCall;
+    llvm::Value *OrderArg =
+        llvm::ConstantInt::get(Int32Ty, OrderC->getZExtValue());
+    llvm::Value *PolicyArg =
+        llvm::ConstantInt::get(Int32Ty, PolicyC->getZExtValue());
+
+    llvm::Type *PtrTy = StoreAddr->getType();
+    llvm::Type *ValTy = StoreValue->getType();
+
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store, {PtrTy, ValTy});
+
+    // Emit a single intrinsic so backend can expand to STSHH followed by
+    // atomic store, to guarantee STSHH immediately precedes store insn.
+    return Builder.CreateCall(F, {StoreAddr, StoreValue, OrderArg, PolicyArg});
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index ca844d647b52a..90285ccda49f4 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1177,8 +1177,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // Check if value is an integer type.
   Expr *ValArg = ValRes.get();
   if (!ValArg->getType()->isIntegerType()) {
-    SemaRef.Diag(Loc,
-                 diag::err_arm_atomic_store_with_stshh_bad_value_must_be_integer)
+    SemaRef.Diag(
+        Loc, diag::err_arm_atomic_store_with_stshh_bad_value_must_be_integer)
         << ValArg->getType() << ValArg->getSourceRange();
     return true;
   }
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
index fceb739782641..e87ef3253a6cc 100644
--- a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
@@ -1,31 +1,68 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
 #include <arm_acle.h>
 
+// CHECK-LABEL: define dso_local void @test_u8(
+// CHECK-SAME: ptr noundef [[P:%.*]], i8 noundef [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store i8 [[V]], ptr [[V_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[V_ADDR]], align 1
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr [[TMP0]], i8 [[TMP1]], i32 0, i32 0)
+// CHECK-NEXT:    ret void
+//
 void test_u8(unsigned char *p, unsigned char v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
 }
-// CHECK-LABEL: @test_u8
-// CHECK: call void @llvm.aarch64.stshh(i64 0)
-// CHECK-NEXT: store atomic i8 %{{.*}}, ptr %{{.*}} monotonic
 
+// CHECK-LABEL: define dso_local void @test_u16(
+// CHECK-SAME: ptr noundef [[P:%.*]], i16 noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store i16 [[V]], ptr [[V_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[V_ADDR]], align 2
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr [[TMP0]], i16 [[TMP1]], i32 3, i32 1)
+// CHECK-NEXT:    ret void
+//
 void test_u16(unsigned short *p, unsigned short v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELEASE, 1);
 }
-// CHECK-LABEL: @test_u16
-// CHECK: call void @llvm.aarch64.stshh(i64 1)
-// CHECK-NEXT: store atomic i16 %{{.*}}, ptr %{{.*}} release
 
+// CHECK-LABEL: define dso_local void @test_u32(
+// CHECK-SAME: ptr noundef [[P:%.*]], i32 noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr [[TMP0]], i32 [[TMP1]], i32 5, i32 0)
+// CHECK-NEXT:    ret void
+//
 void test_u32(unsigned int *p, unsigned int v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_SEQ_CST, 0);
 }
-// CHECK-LABEL: @test_u32
-// CHECK: call void @llvm.aarch64.stshh(i64 0)
-// CHECK-NEXT: store atomic i32 %{{.*}}, ptr %{{.*}} seq_cst
 
-void test_u64(unsigned long long *p, unsigned long long v) {
+// CHECK-LABEL: define dso_local void @test_u64(
+// CHECK-SAME: ptr noundef [[P:%.*]], i64 noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr [[TMP0]], i64 [[TMP1]], i32 0, i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_u64(unsigned long *p, unsigned long v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 1);
 }
-// CHECK-LABEL: @test_u64
-// CHECK: call void @llvm.aarch64.stshh(i64 1)
-// CHECK-NEXT: store atomic i64 %{{.*}}, ptr %{{.*}} monotonic
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 19ba3a5a740c5..52531eebef42a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -63,7 +63,13 @@ def int_aarch64_frint64x
 
 def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_stshh
-    : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrHasSideEffects]>;
+    : DefaultAttrsIntrinsic<[], [llvm_i32_ty],
+                            [IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+def int_aarch64_stshh_atomic_store
+    : Intrinsic<[],
+                [llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty],
+                [IntrHasSideEffects, ImmArg<ArgIndex<2>>,
+                 ImmArg<ArgIndex<3>>]>;
 
 def int_aarch64_break : Intrinsic<[], [llvm_i32_ty],
     [IntrNoMem, IntrHasSideEffects, IntrNoReturn, IntrCold, ImmArg<ArgIndex<0>>]>;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 27d5940c808d2..3cbcd80c4c627 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugLoc.h"
@@ -92,6 +93,8 @@ class AArch64ExpandPseudo : public MachineFunctionPass {
   bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
   bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI);
+  bool expandSTSHHAtomicStore(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI);
   struct ConditionalBlocks {
     MachineBasicBlock &CondBB;
     MachineBasicBlock &EndBB;
@@ -1001,6 +1004,60 @@ bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext(
   return true;
 }
 
+bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL(MI.getDebugLoc());
+
+  unsigned Order = MI.getOperand(2).getImm();
+  uint64_t Policy = MI.getOperand(3).getImm();
+
+  bool IsRelaxed = Order == 0;
+  unsigned StoreOpc = 0;
+
+  // __ATOMIC_RELAXED uses STR. __ATOMIC_{RELEASE/SEQ_CST} use STLR
+  switch (MI.getOpcode()) {
+  case AArch64::STSHH_ATOMIC_STORE_B:
+    StoreOpc = IsRelaxed ? AArch64::STRBBui : AArch64::STLRB;
+    break;
+  case AArch64::STSHH_ATOMIC_STORE_H:
+    StoreOpc = IsRelaxed ? AArch64::STRHHui : AArch64::STLRH;
+    break;
+  case AArch64::STSHH_ATOMIC_STORE_W:
+    StoreOpc = IsRelaxed ? AArch64::STRWui : AArch64::STLRW;
+    break;
+  case AArch64::STSHH_ATOMIC_STORE_X:
+    StoreOpc = IsRelaxed ? AArch64::STRXui : AArch64::STLRX;
+    break;
+  default:
+    llvm_unreachable("Unexpected STSHH atomic store pseudo");
+  }
+
+  // Emit the hint with the retention policy immediate.
+  MachineInstr *Hint = BuildMI(MBB, MBBI, DL, TII->get(AArch64::STSHH))
+                           .addImm(Policy)
+                           .getInstr();
+
+  // Emit the associated store instruction.
+  MachineInstrBuilder Store = BuildMI(MBB, MBBI, DL, TII->get(StoreOpc))
+                                  .add(MI.getOperand(0))
+                                  .add(MI.getOperand(1));
+
+  // Relaxed uses base+imm addressing with a zero offset.
+  if (IsRelaxed)
+    Store.addImm(0);
+
+  // Preserve memory operands and any implicit uses/defs.
+  Store->setMemRefs(*MBB.getParent(), MI.memoperands());
+  transferImpOps(MI, Store, Store);
+
+  // Bundle the hint and store so they remain adjacent.
+  finalizeBundle(MBB, Hint->getIterator(), std::next(Store->getIterator()));
+
+  MI.eraseFromParent();
+  return true;
+}
+
 AArch64ExpandPseudo::ConditionalBlocks
 AArch64ExpandPseudo::expandConditionalPseudo(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator MBBI,
@@ -1696,6 +1753,11 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
      return expandCALL_BTI(MBB, MBBI);
    case AArch64::StoreSwiftAsyncContext:
      return expandStoreSwiftAsyncContext(MBB, MBBI);
+   case AArch64::STSHH_ATOMIC_STORE_B:
+   case AArch64::STSHH_ATOMIC_STORE_H:
+   case AArch64::STSHH_ATOMIC_STORE_W:
+   case AArch64::STSHH_ATOMIC_STORE_X:
+     return expandSTSHHAtomicStore(MBB, MBBI);
    case AArch64::RestoreZAPseudo:
    case AArch64::CommitZASavePseudo:
    case AArch64::MSRpstatePseudo: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ebe9a6b1dfe61..b12db3153125d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6366,6 +6366,88 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                        Op.getOperand(0),                        // Chain
                        DAG.getTargetConstant(24, DL, MVT::i32), // Rt
                        Op.getOperand(2));                       // Addr
+  case Intrinsic::aarch64_stshh: {
+    SDValue Chain = Op.getOperand(0);
+    auto *PolicyC = cast<ConstantSDNode>(Op.getOperand(2));
+    SDValue Policy =
+        DAG.getTargetConstant(PolicyC->getZExtValue(), DL, MVT::i32);
+    SDValue Ops[] = {Policy, Chain};
+    MachineSDNode *N = DAG.getMachineNode(AArch64::STSHH, DL, MVT::Other, Ops);
+    return SDValue(N, 0);
+  }
+  case Intrinsic::aarch64_stshh_atomic_store: {
+    SDValue Chain = Op.getOperand(0);
+    SDValue Ptr = Op.getOperand(2);
+    SDValue Val = Op.getOperand(3);
+    auto *OrderC = cast<ConstantSDNode>(Op.getOperand(4));
+    auto *PolicyC = cast<ConstantSDNode>(Op.getOperand(5));
+    uint64_t OrderVal = OrderC->getZExtValue();
+
+    unsigned SizeBits = Val.getValueType().getSizeInBits();
+    if (SizeBits < 8)
+      SizeBits = 8;
+    unsigned PseudoOpc = 0;
+    // Select pseudo opcode based on value size.
+    switch (SizeBits) {
+    case 8:
+      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_B;
+      break;
+    case 16:
+      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_H;
+      break;
+    case 32:
+      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_W;
+      break;
+    case 64:
+      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_X;
+      break;
+    default:
+      llvm_unreachable("Unexpected STSHH atomic store size");
+    }
+
+    // Extend or truncate value to expected store width
+    if (SizeBits <= 32)
+      Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i32);
+    else
+      Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
+
+    SDValue Order = DAG.getTargetConstant(OrderVal, DL, MVT::i32);
+    SDValue Policy =
+        DAG.getTargetConstant(PolicyC->getZExtValue(), DL, MVT::i32);
+
+    // Build pseudo which expands to STSHH + atomic store.
+    SDValue Ops[] = {Val, Ptr, Order, Policy, Chain};
+    MachineSDNode *N = DAG.getMachineNode(PseudoOpc, DL, MVT::Other, Ops);
+
+    // Select correct memory ordering for the store
+    AtomicOrdering Ordering;
+    switch (OrderVal) {
+    case 0: // __ATOMIC_RELAXED
+      Ordering = AtomicOrdering::Monotonic;
+      break;
+    case 3: // __ATOMIC_RELEASE
+      Ordering = AtomicOrdering::Release;
+      break;
+    case 5: // __ATOMIC_SEQ_CST
+      Ordering = AtomicOrdering::SequentiallyConsistent;
+      break;
+    default:
+      llvm_unreachable("Unexpected memory order for STSHH atomic store");
+    }
+
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT MemVT = EVT::getIntegerVT(Ctx, SizeBits);
+    Type *MemTy = MemVT.getTypeForEVT(Ctx);
+    Align Alignment = DAG.getDataLayout().getABITypeAlign(MemTy);
+    uint64_t Size = MemVT.getStoreSize();
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+        MachinePointerInfo(), MachineMemOperand::MOStore, Size, Alignment,
+        AAMDNodes(), nullptr, SyncScope::System, Ordering);
+
+    DAG.setNodeMemRefs(N, {MMO});
+    return SDValue(N, 0);
+  }
   case Intrinsic::aarch64_sme_str:
   case Intrinsic::aarch64_sme_ldr: {
     return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1390600488bf2..5b2071b2c0b11 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1159,6 +1159,11 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
   let ParserMatchClass = Imm0_7Operand;
 }
 
+// imm0_7_i32 predicate - True if the immediate is in the range [0,7]
+def imm0_7_i32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 8;
+}]>;
+
 // imm0_3 predicate - True if the immediate is in the range [0,3]
 def imm0_3 : Operand<i64>, ImmLeaf<i64, [{
   return ((uint64_t)Imm) < 4;
@@ -1840,17 +1845,17 @@ def PHintInstOperand : AsmOperandClass {
     let ParserMethod = "tryParsePHintInstOperand";
 }
 
-def phint_op : Operand<i64> {
+def phint_op : Operand<i32> {
     let ParserMatchClass = PHintInstOperand;
     let PrintMethod = "printPHintOp";
     let OperandType = "OPERAND_IMMEDIATE";
-    let MIOperandInfo = (ops i64imm);
+    let MIOperandInfo = (ops i32imm);
     let DecoderMethod = "DecodeUImm<3>";
 }
 
 class STSHHI
     : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy",
-                    [(int_aarch64_stshh (i64 imm0_7:$policy))]>,
+                    [(int_aarch64_stshh (i32 imm0_7_i32:$policy))]>,
       Sched<[WriteHint]> {
   bits<3> policy;
   // NOTE: ideally, this would have mayLoad = 0, mayStore = 0, but we cannot
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 21c0b0502d0d1..c0457dc529d3d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1577,6 +1577,21 @@ def : InstAlias<"nop", (NOP)>;
 
 def STSHH: STSHHI;
 
+let hasSideEffects = 1, mayStore = 1, isPseudo = 1, isCodeGenOnly = 1 in {
+def STSHH_ATOMIC_STORE_B
+    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
+                          i64imm:$policy), []>, Sched<[]>;
+def STSHH_ATOMIC_STORE_H
+    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
+                          i64imm:$policy), []>, Sched<[]>;
+def STSHH_ATOMIC_STORE_W
+    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
+                          i64imm:$policy), []>, Sched<[]>;
+def STSHH_ATOMIC_STORE_X
+    : Pseudo<(outs), (ins GPR64:$val, GPR64sp:$addr, i32imm:$order,
+                          i64imm:$policy), []>, Sched<[]>;
+}
+
 // In order to be able to write readable assembly, LLVM should accept assembly
 // inputs that use Branch Target Identification mnemonics, even with BTI disabled.
 // However, in order to be compatible with other assemblers (e.g. GAS), LLVM
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index f6e6b1838fa5d..06affdf5ff650 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -1,14 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
 
-declare void @llvm.aarch64.stshh(i64)
-
 define void @test_keep() {
 ; CHECK-LABEL: test_keep:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh keep
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh(i64 0)
+  call void @llvm.aarch64.stshh(i32 0)
   ret void
 }
 
@@ -17,6 +15,6 @@ define void @test_strm() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh strm
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh(i64 1)
+  call void @llvm.aarch64.stshh(i32 1)
   ret void
 }

>From 70993b142fe97c5415a69258ef34e43b3bc02a16 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Feb 2026 21:14:32 +0000
Subject: [PATCH 07/18] fixup! remove mayLoad/mayStore as suggested by Kerry

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td      | 2 +-
 llvm/lib/Target/AArch64/AArch64InstrFormats.td | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 52531eebef42a..f6abbb81b5071 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -64,7 +64,7 @@ def int_aarch64_frint64x
 def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_stshh
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty],
-                            [IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+                            [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
 def int_aarch64_stshh_atomic_store
     : Intrinsic<[],
                 [llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 5b2071b2c0b11..e3079d4d9d1c2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1858,11 +1858,6 @@ class STSHHI
                     [(int_aarch64_stshh (i32 imm0_7_i32:$policy))]>,
       Sched<[WriteHint]> {
   bits<3> policy;
-  // NOTE: ideally, this would have mayLoad = 0, mayStore = 0, but we cannot
-  // model patterns with sufficiently fine granularity.
-  let mayLoad = 1;
-  let mayStore = 1;
-  let hasSideEffects = 1;
   let Inst{20-12} = 0b000011001;
   let Inst{11-8} = 0b0110;
   let Inst{7-5} = policy;

>From 4b07a1eb1315c5034db4aef7dbaf5821d4496c5d Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 24 Feb 2026 12:05:09 +0000
Subject: [PATCH 08/18] fixup! Fix issues Kerry raised in PR

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 +---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 17 +++-------
 clang/lib/Sema/SemaARM.cpp                    | 33 ++++++-------------
 .../test/Sema/AArch64/pcdphint-atomic-store.c | 25 ++++++++------
 4 files changed, 31 insertions(+), 50 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f7306d3b65bd7..19adf5d75134f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9556,11 +9556,7 @@ def err_arm_atomic_store_with_stshh_bad_type : Error<
   "address argument to '__arm_atomic_store_with_stshh' must be a pointer to an "
   "8,16,32, or 64-bit integer type (%0 invalid)">;
 def err_arm_atomic_store_with_stshh_bad_value_type : Error<
-  "value argument to '__arm_atomic_store_with_stshh' must be an integer of the "
-  "same size as the pointed-to type; expected %0 bits, got %1 bits">;
-def err_arm_atomic_store_with_stshh_bad_value_must_be_integer : Error<
-  "value argument to '__arm_atomic_store_with_stshh' must be an integer type "
-  "(%0 invalid)">;
+  "value argument to '__arm_atomic_store_with_stshh' must be %0; got %1">;
 def err_arm_atomic_store_with_stshh_bad_order : Error<
   "memory order argument to '__arm_atomic_store_with_stshh' must be one of "
   "__ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 2bb01c9450cb8..ba45213ed8672 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5275,8 +5275,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
-    const Expr *Arg0 = E->getArg(0);
-    Value *StoreAddr = EmitScalarExpr(Arg0);
+    Value *StoreAddr = EmitScalarExpr(E->getArg(0));
     Value *StoreValue = EmitScalarExpr(E->getArg(1));
     Value *Order = EmitScalarExpr(E->getArg(2));
     Value *Policy = EmitScalarExpr(E->getArg(3));
@@ -5298,20 +5297,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
           "unexpected memory order for __arm_atomic_store_with_stshh");
     }
 
-    llvm::Value *OrderArg =
-        llvm::ConstantInt::get(Int32Ty, OrderC->getZExtValue());
-    llvm::Value *PolicyArg =
-        llvm::ConstantInt::get(Int32Ty, PolicyC->getZExtValue());
-
-    llvm::Type *PtrTy = StoreAddr->getType();
-    llvm::Type *ValTy = StoreValue->getType();
-
     Function *F =
-        CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store, {PtrTy, ValTy});
+        CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
+              {StoreAddr->getType(), StoreValue->getType()});
+
 
     // Emit a single intrinsic so backend can expand to STSHH followed by
     // atomic store, to guarantee STSHH immediately precedes store insn.
-    return Builder.CreateCall(F, {StoreAddr, StoreValue, OrderArg, PolicyArg});
+    return Builder.CreateCall(F, {StoreAddr, StoreValue, OrderC, PolicyC});
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 90285ccda49f4..561a8eebe4055 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1174,20 +1174,13 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   if (ValRes.isInvalid())
     return true;
 
-  // Check if value is an integer type.
   Expr *ValArg = ValRes.get();
-  if (!ValArg->getType()->isIntegerType()) {
-    SemaRef.Diag(
-        Loc, diag::err_arm_atomic_store_with_stshh_bad_value_must_be_integer)
-        << ValArg->getType() << ValArg->getSourceRange();
-    return true;
-  }
+  QualType ValArgType = ValArg->getType().getUnqualifiedType();
 
-  // Value width must match the pointee width.
-  if (Context.getTypeSize(ValArg->getType()) != Bits) {
+  // Check value type and width
+  if (!Context.hasSameType(ValArgType, ValType)) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
-        << Bits << Context.getTypeSize(ValArg->getType())
-        << ValArg->getSourceRange();
+        << ValType << ValArg->getType() << ValArg->getSourceRange();
     return true;
   }
 
@@ -1221,19 +1214,13 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
   }
 
-  // Validate order here; the value is mapped to LLVM ordering in codegen.
-  llvm::APSInt OrderVal = *OrderValOpt;
-  int64_t Order = OrderVal.getSExtValue();
+  llvm::APSInt OrderVal;
+  if (SemaRef.BuiltinConstantArg(TheCall, 2, OrderVal))
+    return true;
+
   // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
-  constexpr int64_t AtomicRelaxed = 0;
-  constexpr int64_t AtomicRelease = 3;
-  constexpr int64_t AtomicSeqCst = 5;
-  switch (Order) {
-  case AtomicRelaxed:
-  case AtomicRelease:
-  case AtomicSeqCst:
-    break;
-  default:
+  int64_t Order = OrderVal.getSExtValue();
+  if (Order != 0 && Order != 3 && Order != 5) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index 9ca2c0e8f9172..fee7bc9e008b1 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -2,6 +2,15 @@
 
 #include <arm_acle.h>
 
+void test_signed_ok(int *p, int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+}
+
+void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
+  // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
+}
+
 void test_const_pointer(const unsigned int *p, unsigned int v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{address argument to atomic builtin cannot be const-qualified}}
@@ -22,21 +31,17 @@ void test_invalid_memory_order(unsigned int *p, unsigned int v) {
   // expected-error@-1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
 }
 
-void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
-  // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
-}
-
-void test_signed_ok(int *p, int v) {
+void test_value_size_mismatch(int *p, short v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got 'short'}}
 }
 
-void test_value_size_mismatch(int *p, short v) {
+void test_non_integer_value(int *p, float v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer of the same size as the pointed-to type; expected 32 bits, got 16 bits}}
+  // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got 'float'}}
 }
 
-void test_non_integer_value(int *p, float v) {
+void test_value_i128_mismatch(int *p, __int128 v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be an integer type ('float' invalid)}}
+  // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got '__int128'}}
 }

>From e9692c2a7c4113a732f6b703d412e7f6808dce6a Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 24 Feb 2026 14:02:09 +0000
Subject: [PATCH 09/18] fixup! Fix more PR comments

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  3 +-
 clang/test/CodeGen/arm_acle.c                 | 10 +++++++
 clang/test/CodeGen/builtins-arm64.c           |  5 ++++
 .../test/Sema/AArch64/pcdphint-atomic-store.c | 28 +++++++++++++------
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  4 ---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  9 ------
 .../lib/Target/AArch64/AArch64InstrFormats.td |  3 +-
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 14 ++++++----
 8 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index ba45213ed8672..310465aab588e 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5299,8 +5299,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
     Function *F =
         CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
-              {StoreAddr->getType(), StoreValue->getType()});
-
+                         {StoreAddr->getType(), StoreValue->getType()});
 
     // Emit a single intrinsic so backend can expand to STSHH followed by
     // atomic store, to guarantee STSHH immediately precedes store insn.
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index b053778581134..d902eb08a9cc7 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -1822,4 +1822,14 @@ int test_rndrrs(uint64_t *__addr) {
 }
 #endif
 
+#if defined(__ARM_64BIT_STATE)
 
+// AArch64-LABEL: @test_stshh_atomic_store(
+// AArch64-NEXT:  entry:
+// AArch64-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
+// AArch64-NEXT:    ret void
+//
+void test_stshh_atomic_store(int *p, int v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+}
+#endif
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
index 3d054c79f1777..6a1cb845e41fa 100644
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -39,6 +39,11 @@ void hints(void) {
   __builtin_arm_sevl();   //CHECK: call {{.*}} @llvm.aarch64.hint(i32 5)
 }
 
+void stshh_atomic_store(int *p, int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // CHECK: call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr {{.*}}, i32 {{.*}}, i32 0, i32 0)
+}
+
 void barriers(void) {
   __builtin_arm_dmb(1);  //CHECK: call {{.*}} @llvm.aarch64.dmb(i32 1)
   __builtin_arm_dsb(2);  //CHECK: call {{.*}} @llvm.aarch64.dsb(i32 2)
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index fee7bc9e008b1..e3314b84c7a1c 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -3,45 +3,55 @@
 #include <arm_acle.h>
 
 void test_signed_ok(int *p, int v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
 }
 
 void test_invalid_retention_policy(unsigned int *p, unsigned int v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 2);
   // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
 }
 
 void test_const_pointer(const unsigned int *p, unsigned int v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{address argument to atomic builtin cannot be const-qualified}}
 }
 
 void test_non_integer_pointer(float *p, float v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
 }
 
 void test_invalid_bit_width(__int128 *p, __int128 v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
 }
 
 void test_invalid_memory_order(unsigned int *p, unsigned int v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_ACQUIRE, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_ACQUIRE, 0);
   // expected-error@-1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
 }
 
 void test_value_size_mismatch(int *p, short v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got 'short'}}
 }
 
 void test_non_integer_value(int *p, float v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got 'float'}}
 }
 
+void test_too_few_args(int *p, int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED);
+  // expected-error@-1 {{too few arguments to function call, expected 4, have 3}}
+}
+
+void test_too_many_args(int *p, int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0, 1);
+  // expected-error@-1 {{too many arguments to function call, expected 4, have 5}}
+}
+
 void test_value_i128_mismatch(int *p, __int128 v) {
-  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error@-1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got '__int128'}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index f6abbb81b5071..a8ec48da7a8b4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -62,9 +62,6 @@ def int_aarch64_frint64x
 // HINT
 
 def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
-def int_aarch64_stshh
-    : DefaultAttrsIntrinsic<[], [llvm_i32_ty],
-                            [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
 def int_aarch64_stshh_atomic_store
     : Intrinsic<[],
                 [llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4295,4 +4292,3 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
 }
-
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b12db3153125d..0a754678649cf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6366,15 +6366,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                        Op.getOperand(0),                        // Chain
                        DAG.getTargetConstant(24, DL, MVT::i32), // Rt
                        Op.getOperand(2));                       // Addr
-  case Intrinsic::aarch64_stshh: {
-    SDValue Chain = Op.getOperand(0);
-    auto *PolicyC = cast<ConstantSDNode>(Op.getOperand(2));
-    SDValue Policy =
-        DAG.getTargetConstant(PolicyC->getZExtValue(), DL, MVT::i32);
-    SDValue Ops[] = {Policy, Chain};
-    MachineSDNode *N = DAG.getMachineNode(AArch64::STSHH, DL, MVT::Other, Ops);
-    return SDValue(N, 0);
-  }
   case Intrinsic::aarch64_stshh_atomic_store: {
     SDValue Chain = Op.getOperand(0);
     SDValue Ptr = Op.getOperand(2);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index e3079d4d9d1c2..322522741e0d9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1854,8 +1854,7 @@ def phint_op : Operand<i32> {
 }
 
 class STSHHI
-    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy",
-                    [(int_aarch64_stshh (i32 imm0_7_i32:$policy))]>,
+    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy">,
       Sched<[WriteHint]> {
   bits<3> policy;
   let Inst{20-12} = 0b000011001;
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index 06affdf5ff650..60bbbbbcd7058 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -1,20 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
 
-define void @test_keep() {
-; CHECK-LABEL: test_keep:
+define void @test_keep_relaxed(ptr %p, i32 %v) {
+; CHECK-LABEL: test_keep_relaxed:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    str w1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh(i32 0)
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
   ret void
 }
 
-define void @test_strm() {
-; CHECK-LABEL: test_strm:
+define void @test_strm_release(ptr %p, i32 %v) {
+; CHECK-LABEL: test_strm_release:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlr w1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh(i32 1)
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 3, i32 1)
   ret void
 }

>From e43f8bb5eabcb1e63e0fc6e7e6a18ba0b2e87e52 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 24 Feb 2026 20:46:29 +0000
Subject: [PATCH 10/18] fixup! Address more helpful review comments from Kerry

---
 clang/lib/Sema/SemaARM.cpp                    |   5 +-
 .../lib/Target/AArch64/AArch64InstrFormats.td |   5 -
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   8 +-
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 160 ++++++++++++++++++
 4 files changed, 165 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 561a8eebe4055..1d22abff33d48 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1227,10 +1227,7 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   }
 
   // Arg 3 (retention policy) must be between KEEP(0) and STRM(1).
-  if (SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 1))
-    return true;
-
-  return false;
+  return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 1);
 }
 
 bool SemaARM::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 322522741e0d9..b9ccc41e4432b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1159,11 +1159,6 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
   let ParserMatchClass = Imm0_7Operand;
 }
 
-// imm0_7_i32 predicate - True if the immediate is in the range [0,7]
-def imm0_7_i32 : Operand<i32>, ImmLeaf<i32, [{
-  return ((uint32_t)Imm) < 8;
-}]>;
-
 // imm0_3 predicate - True if the immediate is in the range [0,3]
 def imm0_3 : Operand<i64>, ImmLeaf<i64, [{
   return ((uint64_t)Imm) < 4;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c0457dc529d3d..c2685fcb903ca 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1580,16 +1580,16 @@ def STSHH: STSHHI;
 let hasSideEffects = 1, mayStore = 1, isPseudo = 1, isCodeGenOnly = 1 in {
 def STSHH_ATOMIC_STORE_B
     : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i64imm:$policy), []>, Sched<[]>;
+                          i32imm:$policy), []>, Sched<[]>;
 def STSHH_ATOMIC_STORE_H
     : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i64imm:$policy), []>, Sched<[]>;
+                          i32imm:$policy), []>, Sched<[]>;
 def STSHH_ATOMIC_STORE_W
     : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i64imm:$policy), []>, Sched<[]>;
+                          i32imm:$policy), []>, Sched<[]>;
 def STSHH_ATOMIC_STORE_X
     : Pseudo<(outs), (ins GPR64:$val, GPR64sp:$addr, i32imm:$order,
-                          i64imm:$policy), []>, Sched<[]>;
+                          i32imm:$policy), []>, Sched<[]>;
 }
 
 // In order to be able to write readable assembly, LLVM should accept assembly
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index 60bbbbbcd7058..67b89112f91a7 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -20,3 +20,163 @@ define void @test_strm_release(ptr %p, i32 %v) {
   call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 3, i32 1)
   ret void
 }
+
+define void @test_keep_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_keep_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    strb w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 0, i32 0)
+  ret void
+}
+
+define void @test_keep_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_keep_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    strh w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 0, i32 0)
+  ret void
+}
+
+define void @test_keep_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_keep_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    str w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
+  ret void
+}
+
+define void @test_keep_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_keep_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    str x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 0, i32 0)
+  ret void
+}
+
+define void @test_strm_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_strm_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    strb w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 0, i32 1)
+  ret void
+}
+
+define void @test_strm_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_strm_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    strh w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 0, i32 1)
+  ret void
+}
+
+define void @test_strm_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_strm_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    str w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 1)
+  ret void
+}
+
+define void @test_strm_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_strm_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    str x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 0, i32 1)
+  ret void
+}
+
+define void @test_strm_release_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_strm_release_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlrb w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 3, i32 1)
+  ret void
+}
+
+define void @test_strm_release_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_strm_release_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlrh w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 3, i32 1)
+  ret void
+}
+
+define void @test_strm_release_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_strm_release_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlr w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 3, i32 1)
+  ret void
+}
+
+define void @test_strm_release_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_strm_release_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlr x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 3, i32 1)
+  ret void
+}
+
+define void @test_strm_seqcst_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_strm_seqcst_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlrb w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 5, i32 1)
+  ret void
+}
+
+define void @test_strm_seqcst_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_strm_seqcst_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlrh w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 5, i32 1)
+  ret void
+}
+
+define void @test_strm_seqcst_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_strm_seqcst_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlr w1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 5, i32 1)
+  ret void
+}
+
+define void @test_strm_seqcst_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_strm_seqcst_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stlr x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 5, i32 1)
+  ret void
+}

>From 55bcb7c55a67a21754246bcb914d576b635f7d1a Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 24 Feb 2026 23:57:51 +0000
Subject: [PATCH 11/18] fixup! Fix tests

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0a754678649cf..2e3674d86afbe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2121,6 +2121,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
   }
 
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   if (Subtarget->hasSVE()) {

>From ca907524f6719336de1bea7f6707f540857356f9 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 25 Feb 2026 01:09:21 +0000
Subject: [PATCH 12/18] fixup! Move code to `AArch64ExpandPseudoInsts` and
 `getTgtMemIntrinsic`

Move code to `AArch64ExpandPseudoInsts` and `getTgtMemIntrinsic`
and use tablegen pattern for intrinsic, plus other small review changes.
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  22 +++-
 clang/lib/Sema/SemaARM.cpp                    |  89 ++++++-------
 .../CodeGen/AArch64/pcdphint-atomic-store.c   |  11 +-
 clang/test/CodeGen/arm_acle.c                 |   2 +-
 clang/test/CodeGen/builtins-arm64.c           |   2 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   6 +-
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  33 +++--
 .../Target/AArch64/AArch64ISelLowering.cpp    | 122 +++++++-----------
 .../lib/Target/AArch64/AArch64InstrFormats.td |   2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  22 ++--
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 101 ++++++++++-----
 11 files changed, 220 insertions(+), 192 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 310465aab588e..3395d322b434d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5297,13 +5297,25 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
           "unexpected memory order for __arm_atomic_store_with_stshh");
     }
 
-    Function *F =
-        CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
-                         {StoreAddr->getType(), StoreValue->getType()});
+    // Compute pointee bit-width from arg0 and create as i32 constant
+    QualType ValQT =
+        E->getArg(0)->getType()->castAs<PointerType>()->getPointeeType();
+    unsigned SizeBits = getContext().getTypeSize(ValQT);
+    auto *SizeC = llvm::ConstantInt::get(Int32Ty, SizeBits);
+
+    Value *StoreValue64 = Builder.CreateZExtOrTrunc(StoreValue, Int64Ty);
+
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
+                                   {StoreAddr->getType()});
+
+    // Intrinsic imm args are i32 regardless of source integer width
+    auto *OrderI32 = llvm::ConstantInt::get(Int32Ty, OrderC->getZExtValue());
+    auto *PolicyI32 = llvm::ConstantInt::get(Int32Ty, PolicyC->getZExtValue());
 
     // Emit a single intrinsic so backend can expand to STSHH followed by
-    // atomic store, to guarantee STSHH immediately precedes store insn.
-    return Builder.CreateCall(F, {StoreAddr, StoreValue, OrderC, PolicyC});
+    // atomic store, to guarantee STSHH immediately precedes STR insn
+    return Builder.CreateCall(
+        F, {StoreAddr, StoreValue64, OrderI32, PolicyI32, SizeC});
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 1d22abff33d48..937432d21f3d5 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1119,61 +1119,61 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   if (SemaRef.checkArgCount(TheCall, 4))
     return true;
 
+  // Normalize arg0/arg1 into value form, and check valid
   ExprResult PtrRes =
       SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
+  ExprResult ValRes =
+      SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(1));
 
-  // Bail if conversion failed.
   if (PtrRes.isInvalid())
     return true;
 
+  if (ValRes.isInvalid())
+    return true;
+
   TheCall->setArg(0, PtrRes.get());
   Expr *PointerArg = PtrRes.get();
+  QualType PtrType = PointerArg->getType();
 
   // Check arg 0 is a pointer type, err out if not
-  const PointerType *PointerTy = PointerArg->getType()->getAs<PointerType>();
+  const PointerType *PointerTy = PtrType->getAs<PointerType>();
   if (!PointerTy) {
     SemaRef.Diag(Loc, diag::err_atomic_builtin_must_be_pointer)
-        << PointerArg->getType() << 0 << PointerArg->getSourceRange();
+        << PtrType << 0 << PointerArg->getSourceRange();
     return true;
   }
 
-  // Reject const-qualified pointee types, with an error
+  // Reject const-qualified pointee types
   QualType ValType = PointerTy->getPointeeType();
   if (ValType.isConstQualified()) {
     SemaRef.Diag(Loc, diag::err_atomic_builtin_cannot_be_const)
-        << PointerArg->getType() << PointerArg->getSourceRange();
+        << PtrType << PointerArg->getSourceRange();
     return true;
   }
 
-  // Only integer element types are supported.
   ValType = ValType.getUnqualifiedType();
-  if (!ValType->isIntegerType()) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_type)
-        << PointerArg->getType() << PointerArg->getSourceRange();
-    return true;
+  bool BadInt = true;
+  if (ValType->isIntegerType()) {
+    unsigned Bits = Context.getTypeSize(ValType);
+    switch (Bits) {
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+      BadInt = false;
+      break;
+    default:
+      break;
+    }
   }
 
-  // Only 8/16/32/64-bit integers are supported.
-  unsigned Bits = Context.getTypeSize(ValType);
-  switch (Bits) {
-  case 8:
-  case 16:
-  case 32:
-  case 64:
-    break;
-  default:
+  // Only 8/16/32/64-bit integers are supported
+  if (BadInt) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_type)
-        << PointerArg->getType() << PointerArg->getSourceRange();
+        << PtrType << PointerArg->getSourceRange();
     return true;
   }
 
-  ExprResult ValRes =
-      SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(1));
-
-  // Bail if conversion failed.
-  if (ValRes.isInvalid())
-    return true;
-
   Expr *ValArg = ValRes.get();
   QualType ValArgType = ValArg->getType().getUnqualifiedType();
 
@@ -1184,21 +1184,6 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
   }
 
-  // Prepare a cast if the value type differs
-  ExprResult ValArgRes;
-  CastKind CK =
-      ValArg->getType().getCanonicalType() == ValType.getCanonicalType()
-          ? CK_NoOp
-          : CK_IntegralCast;
-
-  // Apply cast to the pointee type.
-  ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
-
-  // Bail if cast failed.
-  if (ValArgRes.isInvalid())
-    return true;
-
-  TheCall->setArg(1, ValArgRes.get());
   Expr *OrderArg = TheCall->getArg(2);
 
   // Defer validation for dependent memory_order arguments.
@@ -1214,18 +1199,28 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
   }
 
-  llvm::APSInt OrderVal;
-  if (SemaRef.BuiltinConstantArg(TheCall, 2, OrderVal))
-    return true;
-
   // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
-  int64_t Order = OrderVal.getSExtValue();
+  int64_t Order = OrderValOpt->getSExtValue();
   if (Order != 0 && Order != 3 && Order != 5) {
     SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
   }
 
+  // Prepare a cast if the value type differs
+  ExprResult ValArgRes;
+  CastKind CK =
+      ValArg->getType().getCanonicalType() == ValType.getCanonicalType()
+          ? CK_NoOp
+          : CK_IntegralCast;
+
+  // Apply cast to the pointee type, bail if cast failed
+  ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
+  if (ValArgRes.isInvalid())
+    return true;
+
+  TheCall->setArg(1, ValArgRes.get());
+
   // Arg 3 (retention policy) must be between KEEP(0) and STRM(1).
   return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 1);
 }
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
index e87ef3253a6cc..f48f1d6344bc5 100644
--- a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
@@ -12,7 +12,8 @@
 // CHECK-NEXT:    store i8 [[V]], ptr [[V_ADDR]], align 1
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[V_ADDR]], align 1
-// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr [[TMP0]], i8 [[TMP1]], i32 0, i32 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i64
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 0, i32 0, i32 8)
 // CHECK-NEXT:    ret void
 //
 void test_u8(unsigned char *p, unsigned char v) {
@@ -28,7 +29,8 @@ void test_u8(unsigned char *p, unsigned char v) {
 // CHECK-NEXT:    store i16 [[V]], ptr [[V_ADDR]], align 2
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[V_ADDR]], align 2
-// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr [[TMP0]], i16 [[TMP1]], i32 3, i32 1)
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i64
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 3, i32 1, i32 16)
 // CHECK-NEXT:    ret void
 //
 void test_u16(unsigned short *p, unsigned short v) {
@@ -44,7 +46,8 @@ void test_u16(unsigned short *p, unsigned short v) {
 // CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr [[TMP0]], i32 [[TMP1]], i32 5, i32 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 5, i32 0, i32 32)
 // CHECK-NEXT:    ret void
 //
 void test_u32(unsigned int *p, unsigned int v) {
@@ -60,7 +63,7 @@ void test_u32(unsigned int *p, unsigned int v) {
 // CHECK-NEXT:    store i64 [[V]], ptr [[V_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[V_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr [[TMP0]], i64 [[TMP1]], i32 0, i32 1)
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP1]], i32 0, i32 1, i32 64)
 // CHECK-NEXT:    ret void
 //
 void test_u64(unsigned long *p, unsigned long v) {
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index d902eb08a9cc7..9aa62b0b45e86 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -1826,7 +1826,7 @@ int test_rndrrs(uint64_t *__addr) {
 
 // AArch64-LABEL: @test_stshh_atomic_store(
 // AArch64-NEXT:  entry:
-// AArch64-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
+// AArch64:         call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 {{.*}}, i32 0, i32 0, i32 32)
 // AArch64-NEXT:    ret void
 //
 void test_stshh_atomic_store(int *p, int v) {
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
index 6a1cb845e41fa..5344a2c5c6c5b 100644
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -41,7 +41,7 @@ void hints(void) {
 
 void stshh_atomic_store(int *p, int v) {
   __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // CHECK: call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr {{.*}}, i32 {{.*}}, i32 0, i32 0)
+  // CHECK: call void @llvm.aarch64.stshh.atomic.store.p0(ptr {{.*}}, i64 {{.*}}, i32 0, i32 0, i32 32)
 }
 
 void barriers(void) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a8ec48da7a8b4..75929cbc222ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -64,9 +64,10 @@ def int_aarch64_frint64x
 def int_aarch64_hint : DefaultAttrsIntrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_stshh_atomic_store
     : Intrinsic<[],
-                [llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty],
+                [llvm_anyptr_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                 llvm_i32_ty],
                 [IntrHasSideEffects, ImmArg<ArgIndex<2>>,
-                 ImmArg<ArgIndex<3>>]>;
+                 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
 def int_aarch64_break : Intrinsic<[], [llvm_i32_ty],
     [IntrNoMem, IntrHasSideEffects, IntrNoReturn, IntrCold, ImmArg<ArgIndex<0>>]>;
@@ -4292,3 +4293,4 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
 }
+
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3cbcd80c4c627..78f318d17786e 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1011,26 +1011,27 @@ bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
 
   unsigned Order = MI.getOperand(2).getImm();
   uint64_t Policy = MI.getOperand(3).getImm();
+  unsigned Size = MI.getOperand(4).getImm();
 
   bool IsRelaxed = Order == 0;
   unsigned StoreOpc = 0;
 
-  // __ATOMIC_RELAXED uses STR. __ATOMIC_{RELEASE/SEQ_CST} use STLR
-  switch (MI.getOpcode()) {
-  case AArch64::STSHH_ATOMIC_STORE_B:
+  // __ATOMIC_RELAXED uses STR. __ATOMIC_{RELEASE/SEQ_CST} use STLR.
+  switch (Size) {
+  case 8:
     StoreOpc = IsRelaxed ? AArch64::STRBBui : AArch64::STLRB;
     break;
-  case AArch64::STSHH_ATOMIC_STORE_H:
+  case 16:
     StoreOpc = IsRelaxed ? AArch64::STRHHui : AArch64::STLRH;
     break;
-  case AArch64::STSHH_ATOMIC_STORE_W:
+  case 32:
     StoreOpc = IsRelaxed ? AArch64::STRWui : AArch64::STLRW;
     break;
-  case AArch64::STSHH_ATOMIC_STORE_X:
+  case 64:
     StoreOpc = IsRelaxed ? AArch64::STRXui : AArch64::STLRX;
     break;
   default:
-    llvm_unreachable("Unexpected STSHH atomic store pseudo");
+    llvm_unreachable("Unexpected STSHH atomic store size");
   }
 
   // Emit the hint with the retention policy immediate.
@@ -1039,8 +1040,19 @@ bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
                            .getInstr();
 
   // Emit the associated store instruction.
+  Register ValReg = MI.getOperand(0).getReg();
+  Register StoreValReg = ValReg;
+  bool UsesXReg = StoreOpc == AArch64::STRXui || StoreOpc == AArch64::STLRX;
+  if (!UsesXReg) {
+    const TargetRegisterInfo *TRI =
+        MBB.getParent()->getSubtarget().getRegisterInfo();
+    Register SubReg = TRI->getSubReg(ValReg, AArch64::sub_32);
+    if (SubReg)
+      StoreValReg = SubReg;
+  }
+
   MachineInstrBuilder Store = BuildMI(MBB, MBBI, DL, TII->get(StoreOpc))
-                                  .add(MI.getOperand(0))
+                                  .addReg(StoreValReg)
                                   .add(MI.getOperand(1));
 
   // Relaxed uses base+imm addressing with a zero offset.
@@ -1753,10 +1765,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
      return expandCALL_BTI(MBB, MBBI);
    case AArch64::StoreSwiftAsyncContext:
      return expandStoreSwiftAsyncContext(MBB, MBBI);
-   case AArch64::STSHH_ATOMIC_STORE_B:
-   case AArch64::STSHH_ATOMIC_STORE_H:
-   case AArch64::STSHH_ATOMIC_STORE_W:
-   case AArch64::STSHH_ATOMIC_STORE_X:
+   case AArch64::STSHH_ATOMIC_STORE_SZ:
      return expandSTSHHAtomicStore(MBB, MBBI);
    case AArch64::RestoreZAPseudo:
    case AArch64::CommitZASavePseudo:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2e3674d86afbe..c805fa116d973 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2121,8 +2121,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
   }
 
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   if (Subtarget->hasSVE()) {
@@ -6368,79 +6366,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                        Op.getOperand(0),                        // Chain
                        DAG.getTargetConstant(24, DL, MVT::i32), // Rt
                        Op.getOperand(2));                       // Addr
-  case Intrinsic::aarch64_stshh_atomic_store: {
-    SDValue Chain = Op.getOperand(0);
-    SDValue Ptr = Op.getOperand(2);
-    SDValue Val = Op.getOperand(3);
-    auto *OrderC = cast<ConstantSDNode>(Op.getOperand(4));
-    auto *PolicyC = cast<ConstantSDNode>(Op.getOperand(5));
-    uint64_t OrderVal = OrderC->getZExtValue();
-
-    unsigned SizeBits = Val.getValueType().getSizeInBits();
-    if (SizeBits < 8)
-      SizeBits = 8;
-    unsigned PseudoOpc = 0;
-    // Select pseudo opcode based on value size.
-    switch (SizeBits) {
-    case 8:
-      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_B;
-      break;
-    case 16:
-      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_H;
-      break;
-    case 32:
-      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_W;
-      break;
-    case 64:
-      PseudoOpc = AArch64::STSHH_ATOMIC_STORE_X;
-      break;
-    default:
-      llvm_unreachable("Unexpected STSHH atomic store size");
-    }
-
-    // Extend or truncate value to expected store width
-    if (SizeBits <= 32)
-      Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i32);
-    else
-      Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
-
-    SDValue Order = DAG.getTargetConstant(OrderVal, DL, MVT::i32);
-    SDValue Policy =
-        DAG.getTargetConstant(PolicyC->getZExtValue(), DL, MVT::i32);
-
-    // Build pseudo which expands to STSHH + atomic store.
-    SDValue Ops[] = {Val, Ptr, Order, Policy, Chain};
-    MachineSDNode *N = DAG.getMachineNode(PseudoOpc, DL, MVT::Other, Ops);
-
-    // Select correct memory ordering for the store
-    AtomicOrdering Ordering;
-    switch (OrderVal) {
-    case 0: // __ATOMIC_RELAXED
-      Ordering = AtomicOrdering::Monotonic;
-      break;
-    case 3: // __ATOMIC_RELEASE
-      Ordering = AtomicOrdering::Release;
-      break;
-    case 5: // __ATOMIC_SEQ_CST
-      Ordering = AtomicOrdering::SequentiallyConsistent;
-      break;
-    default:
-      llvm_unreachable("Unexpected memory order for STSHH atomic store");
-    }
-
-    LLVMContext &Ctx = *DAG.getContext();
-    EVT MemVT = EVT::getIntegerVT(Ctx, SizeBits);
-    Type *MemTy = MemVT.getTypeForEVT(Ctx);
-    Align Alignment = DAG.getDataLayout().getABITypeAlign(MemTy);
-    uint64_t Size = MemVT.getStoreSize();
-
-    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MachinePointerInfo(), MachineMemOperand::MOStore, Size, Alignment,
-        AAMDNodes(), nullptr, SyncScope::System, Ordering);
-
-    DAG.setNodeMemRefs(N, {MMO});
-    return SDValue(N, 0);
-  }
   case Intrinsic::aarch64_sme_str:
   case Intrinsic::aarch64_sme_ldr: {
     return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
@@ -17868,6 +17793,53 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     Infos.push_back(Info);
     return;
   }
+  case Intrinsic::aarch64_stshh_atomic_store: {
+    const auto *OrderC = dyn_cast<ConstantInt>(I.getArgOperand(2));
+    const auto *SizeC = dyn_cast<ConstantInt>(I.getArgOperand(4));
+    if (!OrderC || !SizeC)
+      return;
+
+    unsigned SizeBits = SizeC->getZExtValue();
+    switch (SizeBits) {
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+      break;
+    default:
+      return;
+    }
+
+    AtomicOrdering Ordering;
+    switch (OrderC->getZExtValue()) {
+    case 0: // __ATOMIC_RELAXED
+      Ordering = AtomicOrdering::Monotonic;
+      break;
+    case 3: // __ATOMIC_RELEASE
+      Ordering = AtomicOrdering::Release;
+      break;
+    case 5: // __ATOMIC_SEQ_CST
+      Ordering = AtomicOrdering::SequentiallyConsistent;
+      break;
+    default:
+      return;
+    }
+
+    // Fill in IntrinsicInfo so SelectionDAG builds a correctly
+    // typed/aligned atomic store MachineMemOperand.
+    LLVMContext &Ctx = I.getContext();
+    Type *MemTy = IntegerType::get(Ctx, SizeBits);
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = EVT::getIntegerVT(Ctx, SizeBits);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlign(MemTy);
+    Info.flags = MachineMemOperand::MOStore;
+    Info.ssid = SyncScope::System;
+    Info.order = Ordering;
+    Infos.push_back(Info);
+    return;
+  }
   case Intrinsic::aarch64_ldaxp:
   case Intrinsic::aarch64_ldxp:
     Info.opc = ISD::INTRINSIC_W_CHAIN;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index b9ccc41e4432b..ee009fe6ab601 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1849,7 +1849,7 @@ def phint_op : Operand<i32> {
 }
 
 class STSHHI
-    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy">,
+    : SimpleSystemI<0, (ins phint_op:$policy), "stshh", "\t$policy", []>,
       Sched<[WriteHint]> {
   bits<3> policy;
   let Inst{20-12} = 0b000011001;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c2685fcb903ca..e079cf46f71be 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1577,21 +1577,19 @@ def : InstAlias<"nop", (NOP)>;
 
 def STSHH: STSHHI;
 
-let hasSideEffects = 1, mayStore = 1, isPseudo = 1, isCodeGenOnly = 1 in {
-def STSHH_ATOMIC_STORE_B
-    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i32imm:$policy), []>, Sched<[]>;
-def STSHH_ATOMIC_STORE_H
-    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i32imm:$policy), []>, Sched<[]>;
-def STSHH_ATOMIC_STORE_W
-    : Pseudo<(outs), (ins GPR32:$val, GPR64sp:$addr, i32imm:$order,
-                          i32imm:$policy), []>, Sched<[]>;
-def STSHH_ATOMIC_STORE_X
+let hasSideEffects = 1, mayStore = 1, isCodeGenOnly = 1 in {
+let Size = 8 in
+def STSHH_ATOMIC_STORE_SZ
     : Pseudo<(outs), (ins GPR64:$val, GPR64sp:$addr, i32imm:$order,
-                          i32imm:$policy), []>, Sched<[]>;
+                          i32imm:$policy, i32imm:$size), []>,
+             Sched<[WriteAtomic]>;
 }
 
+def : Pat<(int_aarch64_stshh_atomic_store GPR64sp:$addr, GPR64:$val,
+           (i32 timm:$order), (i32 timm:$policy), (i32 timm:$size)),
+          (STSHH_ATOMIC_STORE_SZ GPR64:$val, GPR64sp:$addr, (i32 timm:$order),
+           (i32 timm:$policy), (i32 timm:$size))>;
+
 // In order to be able to write readable assembly, LLVM should accept assembly
 // inputs that use Branch Target Identification mnemonics, even with BTI disabled.
 // However, in order to be compatible with other assemblers (e.g. GAS), LLVM
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index 67b89112f91a7..aaa440d834c74 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -1,53 +1,66 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v9.6a -global-isel=1 < %s | FileCheck %s
 
 define void @test_keep_relaxed(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_keep_relaxed:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    str w1, [x0]
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 32)
   ret void
 }
 
 define void @test_strm_release(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_strm_release:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlr w1, [x0]
+; CHECK-NEXT:    stlr w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 3, i32 1)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 32)
   ret void
 }
 
 define void @test_keep_i8(ptr %p, i8 %v) {
 ; CHECK-LABEL: test_keep_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strb w1, [x0]
+; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 0, i32 0)
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 8)
   ret void
 }
 
 define void @test_keep_i16(ptr %p, i16 %v) {
 ; CHECK-LABEL: test_keep_i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strh w1, [x0]
+; CHECK-NEXT:    strh w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 0, i32 0)
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 16)
   ret void
 }
 
 define void @test_keep_i32(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_keep_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    str w1, [x0]
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 0)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 32)
   ret void
 }
 
@@ -57,37 +70,45 @@ define void @test_keep_i64(ptr %p, i64 %v) {
 ; CHECK-NEXT:    stshh keep
 ; CHECK-NEXT:    str x1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 0, i32 0)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 64)
   ret void
 }
 
 define void @test_strm_i8(ptr %p, i8 %v) {
 ; CHECK-LABEL: test_strm_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    strb w1, [x0]
+; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 0, i32 1)
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 8)
   ret void
 }
 
 define void @test_strm_i16(ptr %p, i16 %v) {
 ; CHECK-LABEL: test_strm_i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    strh w1, [x0]
+; CHECK-NEXT:    strh w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 0, i32 1)
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 16)
   ret void
 }
 
 define void @test_strm_i32(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_strm_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    str w1, [x0]
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 0, i32 1)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 32)
   ret void
 }
 
@@ -97,37 +118,45 @@ define void @test_strm_i64(ptr %p, i64 %v) {
 ; CHECK-NEXT:    stshh strm
 ; CHECK-NEXT:    str x1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 0, i32 1)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 1, i32 64)
   ret void
 }
 
 define void @test_strm_release_i8(ptr %p, i8 %v) {
 ; CHECK-LABEL: test_strm_release_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrb w1, [x0]
+; CHECK-NEXT:    stlrb w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 3, i32 1)
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 8)
   ret void
 }
 
 define void @test_strm_release_i16(ptr %p, i16 %v) {
 ; CHECK-LABEL: test_strm_release_i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrh w1, [x0]
+; CHECK-NEXT:    stlrh w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 3, i32 1)
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 16)
   ret void
 }
 
 define void @test_strm_release_i32(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_strm_release_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlr w1, [x0]
+; CHECK-NEXT:    stlr w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 3, i32 1)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 32)
   ret void
 }
 
@@ -137,37 +166,45 @@ define void @test_strm_release_i64(ptr %p, i64 %v) {
 ; CHECK-NEXT:    stshh strm
 ; CHECK-NEXT:    stlr x1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 3, i32 1)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 1, i32 64)
   ret void
 }
 
 define void @test_strm_seqcst_i8(ptr %p, i8 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrb w1, [x0]
+; CHECK-NEXT:    stlrb w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i8(ptr %p, i8 %v, i32 5, i32 1)
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 8)
   ret void
 }
 
 define void @test_strm_seqcst_i16(ptr %p, i16 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrh w1, [x0]
+; CHECK-NEXT:    stlrh w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i16(ptr %p, i16 %v, i32 5, i32 1)
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 16)
   ret void
 }
 
 define void @test_strm_seqcst_i32(ptr %p, i32 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlr w1, [x0]
+; CHECK-NEXT:    stlr w8, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i32(ptr %p, i32 %v, i32 5, i32 1)
+  %v64 = zext i32 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 32)
   ret void
 }
 
@@ -177,6 +214,6 @@ define void @test_strm_seqcst_i64(ptr %p, i64 %v) {
 ; CHECK-NEXT:    stshh strm
 ; CHECK-NEXT:    stlr x1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0.i64(ptr %p, i64 %v, i32 5, i32 1)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 1, i32 64)
   ret void
 }

>From ac6fd510e652522dac7922281b863175ac7d0f57 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 26 Feb 2026 15:04:17 +0000
Subject: [PATCH 13/18] fixup! Small fixes Kerry has suggested

---
 clang/lib/Sema/SemaARM.cpp                    |  18 +--
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |   2 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  16 +--
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 132 ++++++++++++++----
 4 files changed, 117 insertions(+), 51 deletions(-)

diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 937432d21f3d5..93eb5fb4b4517 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1125,11 +1125,14 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   ExprResult ValRes =
       SemaRef.DefaultFunctionArrayLvalueConversion(TheCall->getArg(1));
 
-  if (PtrRes.isInvalid())
+  if (PtrRes.isInvalid() || ValRes.isInvalid())
     return true;
 
-  if (ValRes.isInvalid())
-    return true;
+  Expr *OrderArg = TheCall->getArg(2);
+
+  // Defer validation for dependent memory_order arguments.
+  if (OrderArg->isValueDependent())
+    return false;
 
   TheCall->setArg(0, PtrRes.get());
   Expr *PointerArg = PtrRes.get();
@@ -1184,12 +1187,6 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
   }
 
-  Expr *OrderArg = TheCall->getArg(2);
-
-  // Defer validation for dependent memory_order arguments.
-  if (OrderArg->isValueDependent())
-    return false;
-
   // Require an order value.
   std::optional<llvm::APSInt> OrderValOpt =
       OrderArg->getIntegerConstantExpr(Context);
@@ -1208,14 +1205,13 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   }
 
   // Prepare a cast if the value type differs
-  ExprResult ValArgRes;
   CastKind CK =
       ValArg->getType().getCanonicalType() == ValType.getCanonicalType()
           ? CK_NoOp
           : CK_IntegralCast;
 
   // Apply cast to the pointee type, bail if cast failed
-  ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
+  ExprResult ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
   if (ValArgRes.isInvalid())
     return true;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 78f318d17786e..abb69fbbae916 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1010,7 +1010,7 @@ bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
   DebugLoc DL(MI.getDebugLoc());
 
   unsigned Order = MI.getOperand(2).getImm();
-  uint64_t Policy = MI.getOperand(3).getImm();
+  unsigned Policy = MI.getOperand(3).getImm();
   unsigned Size = MI.getOperand(4).getImm();
 
   bool IsRelaxed = Order == 0;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c805fa116d973..f6341addecb6e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17810,16 +17810,15 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
       return;
     }
 
-    AtomicOrdering Ordering;
-    switch (OrderC->getZExtValue()) {
-    case 0: // __ATOMIC_RELAXED
-      Ordering = AtomicOrdering::Monotonic;
+    switch (static_cast<AtomicOrderingCABI>(OrderC->getZExtValue())) {
+    case AtomicOrderingCABI::relaxed:
+      Info.order = AtomicOrdering::Monotonic;
       break;
-    case 3: // __ATOMIC_RELEASE
-      Ordering = AtomicOrdering::Release;
+    case AtomicOrderingCABI::release:
+      Info.order = AtomicOrdering::Release;
       break;
-    case 5: // __ATOMIC_SEQ_CST
-      Ordering = AtomicOrdering::SequentiallyConsistent;
+    case AtomicOrderingCABI::seq_cst:
+      Info.order = AtomicOrdering::SequentiallyConsistent;
       break;
     default:
       return;
@@ -17836,7 +17835,6 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     Info.align = DL.getABITypeAlign(MemTy);
     Info.flags = MachineMemOperand::MOStore;
     Info.ssid = SyncScope::System;
-    Info.order = Ordering;
     Infos.push_back(Info);
     return;
   }
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index aaa440d834c74..f952ec689b685 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -2,8 +2,34 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a -global-isel=1 < %s | FileCheck %s
 
-define void @test_keep_relaxed(ptr %p, i32 %v) {
-; CHECK-LABEL: test_keep_relaxed:
+define void @test_keep_relaxed_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_keep_relaxed_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 8)
+  ret void
+}
+
+define void @test_keep_relaxed_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_keep_relaxed_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    ret
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 16)
+  ret void
+}
+
+define void @test_keep_relaxed_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_keep_relaxed_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
@@ -14,68 +40,114 @@ define void @test_keep_relaxed(ptr %p, i32 %v) {
   ret void
 }
 
-define void @test_strm_release(ptr %p, i32 %v) {
-; CHECK-LABEL: test_strm_release:
+define void @test_keep_relaxed_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_keep_relaxed_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    str x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 64)
+  ret void
+}
+
+define void @test_keep_release_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_keep_release_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    stlrb w8, [x0]
+; CHECK-NEXT:    ret
+  %v64 = zext i8 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 8)
+  ret void
+}
+
+define void @test_keep_release_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_keep_release_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xffff
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    stlrh w8, [x0]
+; CHECK-NEXT:    ret
+  %v64 = zext i16 %v to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 16)
+  ret void
+}
+
+define void @test_keep_release_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_keep_release_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    stshh strm
+; CHECK-NEXT:    stshh keep
 ; CHECK-NEXT:    stlr w8, [x0]
 ; CHECK-NEXT:    ret
   %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 32)
+  ret void
+}
+
+define void @test_keep_release_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_keep_release_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    stlr x1, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 0, i32 64)
   ret void
 }
 
-define void @test_keep_i8(ptr %p, i8 %v) {
-; CHECK-LABEL: test_keep_i8:
+define void @test_keep_seqcst_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_keep_seqcst_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    stlrb w8, [x0]
 ; CHECK-NEXT:    ret
   %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 8)
   ret void
 }
 
-define void @test_keep_i16(ptr %p, i16 %v) {
-; CHECK-LABEL: test_keep_i16:
+define void @test_keep_seqcst_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_keep_seqcst_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    stlrh w8, [x0]
 ; CHECK-NEXT:    ret
   %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 16)
   ret void
 }
 
-define void @test_keep_i32(ptr %p, i32 %v) {
-; CHECK-LABEL: test_keep_i32:
+define void @test_keep_seqcst_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_keep_seqcst_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    stlr w8, [x0]
 ; CHECK-NEXT:    ret
   %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 32)
   ret void
 }
 
-define void @test_keep_i64(ptr %p, i64 %v) {
-; CHECK-LABEL: test_keep_i64:
+define void @test_keep_seqcst_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_keep_seqcst_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    str x1, [x0]
+; CHECK-NEXT:    stlr x1, [x0]
 ; CHECK-NEXT:    ret
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 64)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 0, i32 64)
   ret void
 }
 
-define void @test_strm_i8(ptr %p, i8 %v) {
-; CHECK-LABEL: test_strm_i8:
+define void @test_strm_relaxed_i8(ptr %p, i8 %v) {
+; CHECK-LABEL: test_strm_relaxed_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    and x8, x1, #0xff
@@ -87,8 +159,8 @@ define void @test_strm_i8(ptr %p, i8 %v) {
   ret void
 }
 
-define void @test_strm_i16(ptr %p, i16 %v) {
-; CHECK-LABEL: test_strm_i16:
+define void @test_strm_relaxed_i16(ptr %p, i16 %v) {
+; CHECK-LABEL: test_strm_relaxed_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    and x8, x1, #0xffff
@@ -100,8 +172,8 @@ define void @test_strm_i16(ptr %p, i16 %v) {
   ret void
 }
 
-define void @test_strm_i32(ptr %p, i32 %v) {
-; CHECK-LABEL: test_strm_i32:
+define void @test_strm_relaxed_i32(ptr %p, i32 %v) {
+; CHECK-LABEL: test_strm_relaxed_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
@@ -112,8 +184,8 @@ define void @test_strm_i32(ptr %p, i32 %v) {
   ret void
 }
 
-define void @test_strm_i64(ptr %p, i64 %v) {
-; CHECK-LABEL: test_strm_i64:
+define void @test_strm_relaxed_i64(ptr %p, i64 %v) {
+; CHECK-LABEL: test_strm_relaxed_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stshh strm
 ; CHECK-NEXT:    str x1, [x0]

>From 79c05bb4dec40acb0a37480faf232310f661d701 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 26 Feb 2026 17:15:15 +0000
Subject: [PATCH 14/18] fixup! Fixes for Caroline

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  3 ---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  4 +--
 .../Target/AArch64/AArch64ISelLowering.cpp    | 13 +--------
 .../pcdphint-atomic-store-diagnostic.ll       | 27 +++++++++++++++++++
 4 files changed, 30 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 3395d322b434d..ef17dd49639c3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5283,9 +5283,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     auto *OrderC = dyn_cast<llvm::ConstantInt>(Order);
     auto *PolicyC = dyn_cast<llvm::ConstantInt>(Policy);
 
-    assert(OrderC && PolicyC &&
-           "order/policy must be constant for __arm_atomic_store_with_stshh");
-
     // Validate ordering argument; bail out if invalid
     switch (OrderC->getZExtValue()) {
     case 0: // __ATOMIC_RELAXED
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index abb69fbbae916..739c1ff53855e 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1042,8 +1042,8 @@ bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
   // Emit the associated store instruction.
   Register ValReg = MI.getOperand(0).getReg();
   Register StoreValReg = ValReg;
-  bool UsesXReg = StoreOpc == AArch64::STRXui || StoreOpc == AArch64::STLRX;
-  if (!UsesXReg) {
+
+  if (Size < 64) {
     const TargetRegisterInfo *TRI =
         MBB.getParent()->getSubtarget().getRegisterInfo();
     Register SubReg = TRI->getSubReg(ValReg, AArch64::sub_32);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f6341addecb6e..82bccb60c0de1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17799,17 +17799,6 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     if (!OrderC || !SizeC)
       return;
 
-    unsigned SizeBits = SizeC->getZExtValue();
-    switch (SizeBits) {
-    case 8:
-    case 16:
-    case 32:
-    case 64:
-      break;
-    default:
-      return;
-    }
-
     switch (static_cast<AtomicOrderingCABI>(OrderC->getZExtValue())) {
     case AtomicOrderingCABI::relaxed:
       Info.order = AtomicOrdering::Monotonic;
@@ -17827,6 +17816,7 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     // Fill IntrinsicInfo so SelectionDAG builds correctly
     // typed/aligned atomic store MachineMemOperand.
     LLVMContext &Ctx = I.getContext();
+    unsigned SizeBits = SizeC->getZExtValue();
     Type *MemTy = IntegerType::get(Ctx, SizeBits);
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(Ctx, SizeBits);
@@ -17834,7 +17824,6 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     Info.offset = 0;
     Info.align = DL.getABITypeAlign(MemTy);
     Info.flags = MachineMemOperand::MOStore;
-    Info.ssid = SyncScope::System;
     Infos.push_back(Info);
     return;
   }
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll
new file mode 100644
index 0000000000000..0c3ba763bff25
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll
@@ -0,0 +1,27 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as %t/bad-value-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
+; RUN: not llvm-as %t/bad-order-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
+; RUN: not llvm-as %t/bad-pointer-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
+
+; INVALID-SIG: error: invalid intrinsic signature
+
+;--- bad-value-type.ll
+; The intrinsic's value operand must be i64.
+define void @bad_value_type(ptr %p, i32 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i32 %v, i32 0, i32 0, i32 32)
+  ret void
+}
+
+;--- bad-order-type.ll
+; The order operand type must be i32.
+define void @bad_order_type(ptr %p, i64 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i64 0, i32 0, i32 64)
+  ret void
+}
+
+;--- bad-pointer-type.ll
+; The pointer operand must be a pointer type.
+define void @bad_pointer_type(i64 %p, i64 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(i64 %p, i64 %v, i32 0, i32 0, i32 64)
+  ret void
+}

>From 4b3554b99b3e195397cde1f87aecaf8e4b2531f5 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 26 Feb 2026 17:33:49 +0000
Subject: [PATCH 15/18] fixup! Sort out testcases

---
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 156 ++++++------------
 1 file changed, 54 insertions(+), 102 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index f952ec689b685..6e48cb348ca05 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -2,41 +2,33 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64 -mattr=+v9.6a -global-isel=1 < %s | FileCheck %s
 
-define void @test_keep_relaxed_i8(ptr %p, i8 %v) {
+define void @test_keep_relaxed_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_relaxed_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    strb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 8)
   ret void
 }
 
-define void @test_keep_relaxed_i16(ptr %p, i16 %v) {
+define void @test_keep_relaxed_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_relaxed_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    strh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 16)
   ret void
 }
 
-define void @test_keep_relaxed_i32(ptr %p, i32 %v) {
+define void @test_keep_relaxed_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_relaxed_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    str w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 0, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 32)
   ret void
 }
 
@@ -50,41 +42,33 @@ define void @test_keep_relaxed_i64(ptr %p, i64 %v) {
   ret void
 }
 
-define void @test_keep_release_i8(ptr %p, i8 %v) {
+define void @test_keep_release_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_release_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlrb w8, [x0]
+; CHECK-NEXT:    stlrb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 0, i32 8)
   ret void
 }
 
-define void @test_keep_release_i16(ptr %p, i16 %v) {
+define void @test_keep_release_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_release_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlrh w8, [x0]
+; CHECK-NEXT:    stlrh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 0, i32 16)
   ret void
 }
 
-define void @test_keep_release_i32(ptr %p, i32 %v) {
+define void @test_keep_release_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_release_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlr w8, [x0]
+; CHECK-NEXT:    stlr w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 0, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 0, i32 32)
   ret void
 }
 
@@ -98,41 +82,33 @@ define void @test_keep_release_i64(ptr %p, i64 %v) {
   ret void
 }
 
-define void @test_keep_seqcst_i8(ptr %p, i8 %v) {
+define void @test_keep_seqcst_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_seqcst_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlrb w8, [x0]
+; CHECK-NEXT:    stlrb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 0, i32 8)
   ret void
 }
 
-define void @test_keep_seqcst_i16(ptr %p, i16 %v) {
+define void @test_keep_seqcst_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_seqcst_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlrh w8, [x0]
+; CHECK-NEXT:    stlrh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 0, i32 16)
   ret void
 }
 
-define void @test_keep_seqcst_i32(ptr %p, i32 %v) {
+define void @test_keep_seqcst_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_seqcst_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh keep
-; CHECK-NEXT:    stlr w8, [x0]
+; CHECK-NEXT:    stlr w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 0, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 0, i32 32)
   ret void
 }
 
@@ -146,41 +122,33 @@ define void @test_keep_seqcst_i64(ptr %p, i64 %v) {
   ret void
 }
 
-define void @test_strm_relaxed_i8(ptr %p, i8 %v) {
+define void @test_strm_relaxed_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_relaxed_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    strb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 1, i32 8)
   ret void
 }
 
-define void @test_strm_relaxed_i16(ptr %p, i16 %v) {
+define void @test_strm_relaxed_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_relaxed_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    strh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 1, i32 16)
   ret void
 }
 
-define void @test_strm_relaxed_i32(ptr %p, i32 %v) {
+define void @test_strm_relaxed_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_relaxed_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    str w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 0, i32 1, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 1, i32 32)
   ret void
 }
 
@@ -194,41 +162,33 @@ define void @test_strm_relaxed_i64(ptr %p, i64 %v) {
   ret void
 }
 
-define void @test_strm_release_i8(ptr %p, i8 %v) {
+define void @test_strm_release_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_release_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrb w8, [x0]
+; CHECK-NEXT:    stlrb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 1, i32 8)
   ret void
 }
 
-define void @test_strm_release_i16(ptr %p, i16 %v) {
+define void @test_strm_release_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_release_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrh w8, [x0]
+; CHECK-NEXT:    stlrh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 1, i32 16)
   ret void
 }
 
-define void @test_strm_release_i32(ptr %p, i32 %v) {
+define void @test_strm_release_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_release_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlr w8, [x0]
+; CHECK-NEXT:    stlr w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 3, i32 1, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 3, i32 1, i32 32)
   ret void
 }
 
@@ -242,41 +202,33 @@ define void @test_strm_release_i64(ptr %p, i64 %v) {
   ret void
 }
 
-define void @test_strm_seqcst_i8(ptr %p, i8 %v) {
+define void @test_strm_seqcst_i8(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrb w8, [x0]
+; CHECK-NEXT:    stlrb w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i8 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 8)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 1, i32 8)
   ret void
 }
 
-define void @test_strm_seqcst_i16(ptr %p, i16 %v) {
+define void @test_strm_seqcst_i16(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x8, x1, #0xffff
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlrh w8, [x0]
+; CHECK-NEXT:    stlrh w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i16 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 16)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 1, i32 16)
   ret void
 }
 
-define void @test_strm_seqcst_i32(ptr %p, i32 %v) {
+define void @test_strm_seqcst_i32(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_strm_seqcst_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
 ; CHECK-NEXT:    stshh strm
-; CHECK-NEXT:    stlr w8, [x0]
+; CHECK-NEXT:    stlr w1, [x0]
 ; CHECK-NEXT:    ret
-  %v64 = zext i32 %v to i64
-  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v64, i32 5, i32 1, i32 32)
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 5, i32 1, i32 32)
   ret void
 }
 

>From c004dba6244b8d6223996f0541d65cefaf4bff2b Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 27 Feb 2026 01:32:12 +0000
Subject: [PATCH 16/18] fixup! More small fixes

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  8 +--
 clang/lib/Headers/arm_acle.h                  |  4 +-
 clang/lib/Sema/SemaARM.cpp                    | 52 ++++++-------------
 clang/test/CodeGen/arm_acle.c                 |  1 -
 .../test/Sema/AArch64/pcdphint-atomic-store.c | 11 ++++
 5 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index ef17dd49639c3..ae95dc2072311 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5280,8 +5280,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Value *Order = EmitScalarExpr(E->getArg(2));
     Value *Policy = EmitScalarExpr(E->getArg(3));
 
-    auto *OrderC = dyn_cast<llvm::ConstantInt>(Order);
-    auto *PolicyC = dyn_cast<llvm::ConstantInt>(Policy);
+    auto *OrderC = cast<llvm::ConstantInt>(Order);
+    auto *PolicyC = cast<llvm::ConstantInt>(Policy);
 
     // Validate ordering argument; bail out if invalid
     switch (OrderC->getZExtValue()) {
@@ -5300,7 +5300,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     unsigned SizeBits = getContext().getTypeSize(ValQT);
     auto *SizeC = llvm::ConstantInt::get(Int32Ty, SizeBits);
 
-    Value *StoreValue64 = Builder.CreateZExtOrTrunc(StoreValue, Int64Ty);
+    const bool IsSignedValType = ValQT->isSignedIntegerType();
+    Value *StoreValue64 =
+        Builder.CreateIntCast(StoreValue, Int64Ty, IsSignedValType);
 
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
                                    {StoreAddr->getType()});
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index fc1af8f1d5a12..499268a3d0237 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -842,8 +842,8 @@ __rndrrs(uint64_t *__p) {
 
 /* Atomic store with PCDPHINT */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-#define __arm_atomic_store_with_stshh(ptr, data, memory_order, ret)            \
-  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order, ret)
+#define __arm_atomic_store_with_stshh(ptr, data, memory_order, retention_policy)            \
+  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order, retention_policy)
 #endif
 
 /* 11.2 Guarded Control Stack intrinsics */
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 93eb5fb4b4517..9725575bfa696 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1111,10 +1111,6 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
                                                  CallExpr *TheCall) {
   Sema &SemaRef = S.SemaRef;
   ASTContext &Context = S.getASTContext();
-  DeclRefExpr *DRE =
-      cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
-  SourceLocation Loc = DRE->getBeginLoc();
-
   // Ensure we have the proper number of arguments.
   if (SemaRef.checkArgCount(TheCall, 4))
     return true;
@@ -1129,19 +1125,20 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
     return true;
 
   Expr *OrderArg = TheCall->getArg(2);
+  TheCall->setArg(0, PtrRes.get());
+  TheCall->setArg(1, ValRes.get());
 
   // Defer validation for dependent memory_order arguments.
   if (OrderArg->isValueDependent())
     return false;
 
-  TheCall->setArg(0, PtrRes.get());
   Expr *PointerArg = PtrRes.get();
   QualType PtrType = PointerArg->getType();
 
   // Check arg 0 is a pointer type, err out if not
   const PointerType *PointerTy = PtrType->getAs<PointerType>();
   if (!PointerTy) {
-    SemaRef.Diag(Loc, diag::err_atomic_builtin_must_be_pointer)
+    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_atomic_builtin_must_be_pointer)
         << PtrType << 0 << PointerArg->getSourceRange();
     return true;
   }
@@ -1149,40 +1146,26 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // Reject const-qualified pointee types
   QualType ValType = PointerTy->getPointeeType();
   if (ValType.isConstQualified()) {
-    SemaRef.Diag(Loc, diag::err_atomic_builtin_cannot_be_const)
+    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_atomic_builtin_cannot_be_const)
         << PtrType << PointerArg->getSourceRange();
     return true;
   }
 
   ValType = ValType.getUnqualifiedType();
-  bool BadInt = true;
-  if (ValType->isIntegerType()) {
-    unsigned Bits = Context.getTypeSize(ValType);
-    switch (Bits) {
-    case 8:
-    case 16:
-    case 32:
-    case 64:
-      BadInt = false;
-      break;
-    default:
-      break;
-    }
-  }
-
-  // Only 8/16/32/64-bit integers are supported
-  if (BadInt) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_type)
+  unsigned Bits = Context.getTypeSize(ValType);
+  if (!ValType->isIntegerType() ||
+      (Bits != 8 && Bits != 16 && Bits != 32 && Bits != 64)) {
+    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_type)
         << PtrType << PointerArg->getSourceRange();
     return true;
   }
 
-  Expr *ValArg = ValRes.get();
+  Expr *ValArg = TheCall->getArg(1);
   QualType ValArgType = ValArg->getType().getUnqualifiedType();
 
   // Check value type and width
   if (!Context.hasSameType(ValArgType, ValType)) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_value_type)
+    SemaRef.Diag(ValArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_value_type)
         << ValType << ValArg->getType() << ValArg->getSourceRange();
     return true;
   }
@@ -1191,7 +1174,7 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   std::optional<llvm::APSInt> OrderValOpt =
       OrderArg->getIntegerConstantExpr(Context);
   if (!OrderValOpt) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
+    SemaRef.Diag(OrderArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
   }
@@ -1199,19 +1182,14 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
   int64_t Order = OrderValOpt->getSExtValue();
   if (Order != 0 && Order != 3 && Order != 5) {
-    SemaRef.Diag(Loc, diag::err_arm_atomic_store_with_stshh_bad_order)
+    SemaRef.Diag(OrderArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
   }
 
-  // Prepare a cast if the value type differs
-  CastKind CK =
-      ValArg->getType().getCanonicalType() == ValType.getCanonicalType()
-          ? CK_NoOp
-          : CK_IntegralCast;
-
-  // Apply cast to the pointee type, bail if cast failed
-  ExprResult ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK);
+  // Value type already matches ValType above; apply a no-op cast for
+  // consistency with other builtin argument rewriting paths.
+  ExprResult ValArgRes = SemaRef.ImpCastExprToType(ValArg, ValType, CK_NoOp);
   if (ValArgRes.isInvalid())
     return true;
 
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 9aa62b0b45e86..3b97f90e806fc 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -1823,7 +1823,6 @@ int test_rndrrs(uint64_t *__addr) {
 #endif
 
 #if defined(__ARM_64BIT_STATE)
-
 // AArch64-LABEL: @test_stshh_atomic_store(
 // AArch64-NEXT:  entry:
 // AArch64:         call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 {{.*}}, i32 0, i32 0, i32 32)
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index e3314b84c7a1c..fb2109eb21602 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o /dev/null -verify %s
 
 #include <arm_acle.h>
 
@@ -31,6 +32,16 @@ void test_invalid_memory_order(unsigned int *p, unsigned int v) {
   // expected-error at -1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
 }
 
+void test_invalid_memory_order_consume(unsigned int *p, unsigned int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_CONSUME, 0);
+  // expected-error at -1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
+}
+
+void test_invalid_memory_order_acq_rel(unsigned int *p, unsigned int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_ACQ_REL, 0);
+  // expected-error at -1 {{memory order argument to '__arm_atomic_store_with_stshh' must be one of __ATOMIC_RELAXED, __ATOMIC_RELEASE, or __ATOMIC_SEQ_CST}}
+}
+
 void test_value_size_mismatch(int *p, short v) {
   __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
   // expected-error at -1 {{value argument to '__arm_atomic_store_with_stshh' must be 'int'; got 'short'}}

>From 1300ed5742d0afa235dd37f6897baeb8793c24d3 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 2 Mar 2026 22:30:29 +0000
Subject: [PATCH 17/18] fixup! Address Carol's comments and fix git
 clang-format issues

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 11 --------
 clang/lib/Headers/arm_acle.h                  |  6 +++--
 clang/lib/Sema/SemaARM.cpp                    | 18 ++++++++-----
 llvm/lib/IR/Verifier.cpp                      | 20 ++++++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  6 ++---
 .../pcdphint-atomic-store-diagnostic.ll       | 27 +++++++++++++++++++
 6 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index ae95dc2072311..d49151d70b435 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5283,17 +5283,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     auto *OrderC = cast<llvm::ConstantInt>(Order);
     auto *PolicyC = cast<llvm::ConstantInt>(Policy);
 
-    // Validate ordering argument; bail out if invalid
-    switch (OrderC->getZExtValue()) {
-    case 0: // __ATOMIC_RELAXED
-    case 3: // __ATOMIC_RELEASE
-    case 5: // __ATOMIC_SEQ_CST
-      break;
-    default:
-      llvm_unreachable(
-          "unexpected memory order for __arm_atomic_store_with_stshh");
-    }
-
     // Compute pointee bit-width from arg0 and create as i32 constant
     QualType ValQT =
         E->getArg(0)->getType()->castAs<PointerType>()->getPointeeType();
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 499268a3d0237..929c88cf72ef2 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -842,8 +842,10 @@ __rndrrs(uint64_t *__p) {
 
 /* Atomic store with PCDPHINT */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
-#define __arm_atomic_store_with_stshh(ptr, data, memory_order, retention_policy)            \
-  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order, retention_policy)
+#define __arm_atomic_store_with_stshh(ptr, data, memory_order,                 \
+                                      retention_policy)                        \
+  __builtin_arm_atomic_store_with_stshh(ptr, data, memory_order,               \
+                                        retention_policy)
 #endif
 
 /* 11.2 Guarded Control Stack intrinsics */
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 9725575bfa696..48912ef57d30d 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1138,7 +1138,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // Check arg 0 is a pointer type, err out if not
   const PointerType *PointerTy = PtrType->getAs<PointerType>();
   if (!PointerTy) {
-    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_atomic_builtin_must_be_pointer)
+    SemaRef.Diag(PointerArg->getBeginLoc(),
+                 diag::err_atomic_builtin_must_be_pointer)
         << PtrType << 0 << PointerArg->getSourceRange();
     return true;
   }
@@ -1146,7 +1147,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // Reject const-qualified pointee types
   QualType ValType = PointerTy->getPointeeType();
   if (ValType.isConstQualified()) {
-    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_atomic_builtin_cannot_be_const)
+    SemaRef.Diag(PointerArg->getBeginLoc(),
+                 diag::err_atomic_builtin_cannot_be_const)
         << PtrType << PointerArg->getSourceRange();
     return true;
   }
@@ -1155,7 +1157,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   unsigned Bits = Context.getTypeSize(ValType);
   if (!ValType->isIntegerType() ||
       (Bits != 8 && Bits != 16 && Bits != 32 && Bits != 64)) {
-    SemaRef.Diag(PointerArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_type)
+    SemaRef.Diag(PointerArg->getBeginLoc(),
+                 diag::err_arm_atomic_store_with_stshh_bad_type)
         << PtrType << PointerArg->getSourceRange();
     return true;
   }
@@ -1165,7 +1168,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
 
   // Check value type and width
   if (!Context.hasSameType(ValArgType, ValType)) {
-    SemaRef.Diag(ValArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_value_type)
+    SemaRef.Diag(ValArg->getBeginLoc(),
+                 diag::err_arm_atomic_store_with_stshh_bad_value_type)
         << ValType << ValArg->getType() << ValArg->getSourceRange();
     return true;
   }
@@ -1174,7 +1178,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   std::optional<llvm::APSInt> OrderValOpt =
       OrderArg->getIntegerConstantExpr(Context);
   if (!OrderValOpt) {
-    SemaRef.Diag(OrderArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_order)
+    SemaRef.Diag(OrderArg->getBeginLoc(),
+                 diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
   }
@@ -1182,7 +1187,8 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   // __ATOMIC_RELAXED=0, __ATOMIC_RELEASE=3, __ATOMIC_SEQ_CST=5.
   int64_t Order = OrderValOpt->getSExtValue();
   if (Order != 0 && Order != 3 && Order != 5) {
-    SemaRef.Diag(OrderArg->getBeginLoc(), diag::err_arm_atomic_store_with_stshh_bad_order)
+    SemaRef.Diag(OrderArg->getBeginLoc(),
+                 diag::err_arm_atomic_store_with_stshh_bad_order)
         << OrderArg->getSourceRange();
     return true;
   }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f986f5406b2b3..3f92e8a393999 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6871,6 +6871,26 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           Call);
     break;
   }
+  case Intrinsic::aarch64_stshh_atomic_store: {
+    uint64_t Order = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue();
+    Check(Order == static_cast<uint64_t>(AtomicOrderingCABI::relaxed) ||
+              Order == static_cast<uint64_t>(AtomicOrderingCABI::release) ||
+              Order == static_cast<uint64_t>(AtomicOrderingCABI::seq_cst),
+          "order argument to llvm.aarch64.stshh.atomic.store must be 0, 3 or 5",
+          Call);
+
+    uint64_t Policy = cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue();
+    Check(Policy < 2,
+          "policy argument to llvm.aarch64.stshh.atomic.store must be 0 or 1",
+          Call);
+
+    uint64_t Size = cast<ConstantInt>(Call.getArgOperand(4))->getZExtValue();
+    Check(Size == 8 || Size == 16 || Size == 32 || Size == 64,
+          "size argument to llvm.aarch64.stshh.atomic.store must be 8, 16, "
+          "32 or 64",
+          Call);
+    break;
+  }
   case Intrinsic::callbr_landingpad: {
     const auto *CBR = dyn_cast<CallBrInst>(Call.getOperand(0));
     Check(CBR, "intrinstic requires callbr operand", &Call);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 82bccb60c0de1..c84e0bf7ea4e9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17794,10 +17794,8 @@ void AArch64TargetLowering::getTgtMemIntrinsic(
     return;
   }
   case Intrinsic::aarch64_stshh_atomic_store: {
-    const auto *OrderC = dyn_cast<ConstantInt>(I.getArgOperand(2));
-    const auto *SizeC = dyn_cast<ConstantInt>(I.getArgOperand(4));
-    if (!OrderC || !SizeC)
-      return;
+    const auto *OrderC = cast<ConstantInt>(I.getArgOperand(2));
+    const auto *SizeC = cast<ConstantInt>(I.getArgOperand(4));
 
     switch (static_cast<AtomicOrderingCABI>(OrderC->getZExtValue())) {
     case AtomicOrderingCABI::relaxed:
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll
index 0c3ba763bff25..9091791c50584 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store-diagnostic.ll
@@ -2,8 +2,14 @@
 ; RUN: not llvm-as %t/bad-value-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
 ; RUN: not llvm-as %t/bad-order-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
 ; RUN: not llvm-as %t/bad-pointer-type.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIG
+; RUN: not llvm-as %t/bad-order-value.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-ORDER
+; RUN: not llvm-as %t/bad-policy-value.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-POLICY
+; RUN: not llvm-as %t/bad-size-value.ll -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SIZE
 
 ; INVALID-SIG: error: invalid intrinsic signature
+; INVALID-ORDER: order argument to llvm.aarch64.stshh.atomic.store must be 0, 3 or 5
+; INVALID-POLICY: policy argument to llvm.aarch64.stshh.atomic.store must be 0 or 1
+; INVALID-SIZE: size argument to llvm.aarch64.stshh.atomic.store must be 8, 16, 32 or 64
 
 ;--- bad-value-type.ll
 ; The intrinsic's value operand must be i64.
@@ -25,3 +31,24 @@ define void @bad_pointer_type(i64 %p, i64 %v) {
   call void @llvm.aarch64.stshh.atomic.store.p0(i64 %p, i64 %v, i32 0, i32 0, i32 64)
   ret void
 }
+
+;--- bad-order-value.ll
+; The order operand value must be one of 0, 3, or 5.
+define void @bad_order_value(ptr %p, i64 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 1, i32 0, i32 64)
+  ret void
+}
+
+;--- bad-policy-value.ll
+; The policy operand value must be 0 (keep) or 1 (strm).
+define void @bad_policy_value(ptr %p, i64 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 2, i32 64)
+  ret void
+}
+
+;--- bad-size-value.ll
+; The size operand value must be one of 8, 16, 32, or 64.
+define void @bad_size_value(ptr %p, i64 %v) {
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %v, i32 0, i32 0, i32 0)
+  ret void
+}

>From 2cbe5ee79be1141c78c48c77930807c980721391 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 3 Mar 2026 11:31:27 +0000
Subject: [PATCH 18/18] fixup! More small optimisations Kerry spotted

---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp             | 9 +++------
 llvm/lib/IR/Verifier.cpp                             | 3 +--
 llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 5 ++---
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index d49151d70b435..b0a89b50c7548 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5277,11 +5277,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   if (BuiltinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
     Value *StoreAddr = EmitScalarExpr(E->getArg(0));
     Value *StoreValue = EmitScalarExpr(E->getArg(1));
-    Value *Order = EmitScalarExpr(E->getArg(2));
-    Value *Policy = EmitScalarExpr(E->getArg(3));
 
-    auto *OrderC = cast<llvm::ConstantInt>(Order);
-    auto *PolicyC = cast<llvm::ConstantInt>(Policy);
+    auto *OrderC = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(2)));
+    auto *PolicyC = cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(3)));
 
     // Compute pointee bit-width from arg0 and create as i32 constant
     QualType ValQT =
@@ -5289,9 +5287,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     unsigned SizeBits = getContext().getTypeSize(ValQT);
     auto *SizeC = llvm::ConstantInt::get(Int32Ty, SizeBits);
 
-    const bool IsSignedValType = ValQT->isSignedIntegerType();
     Value *StoreValue64 =
-        Builder.CreateIntCast(StoreValue, Int64Ty, IsSignedValType);
+        Builder.CreateIntCast(StoreValue, Int64Ty, ValQT->isSignedIntegerType());
 
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
                                    {StoreAddr->getType()});
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3f92e8a393999..3784ee00811f8 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6879,8 +6879,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "order argument to llvm.aarch64.stshh.atomic.store must be 0, 3 or 5",
           Call);
 
-    uint64_t Policy = cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue();
-    Check(Policy < 2,
+    Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2,
           "policy argument to llvm.aarch64.stshh.atomic.store must be 0 or 1",
           Call);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 739c1ff53855e..3be7d5e606bfa 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1041,18 +1041,17 @@ bool AArch64ExpandPseudo::expandSTSHHAtomicStore(
 
   // Emit the associated store instruction.
   Register ValReg = MI.getOperand(0).getReg();
-  Register StoreValReg = ValReg;
 
   if (Size < 64) {
     const TargetRegisterInfo *TRI =
         MBB.getParent()->getSubtarget().getRegisterInfo();
     Register SubReg = TRI->getSubReg(ValReg, AArch64::sub_32);
     if (SubReg)
-      StoreValReg = SubReg;
+      ValReg = SubReg;
   }
 
   MachineInstrBuilder Store = BuildMI(MBB, MBBI, DL, TII->get(StoreOpc))
-                                  .addReg(StoreValReg)
+                                  .addReg(ValReg)
                                   .add(MI.getOperand(1));
 
   // Relaxed uses base+imm addressing with a zero offset.



More information about the cfe-commits mailing list