[llvm] [IR][AtomicExpand] Add elementwise modifier to atomicrmw; automatically expand for all targets (PR #189517)
Yonah Goldberg via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 30 22:30:35 PDT 2026
https://github.com/YonahGoldberg updated https://github.com/llvm/llvm-project/pull/189517
From 0075944a40cd709b9a0156b678f1d64dfbf1ab78 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 31 Mar 2026 01:16:28 +0000
Subject: [PATCH 1/3] IR
---
llvm/docs/LangRef.rst | 16 +-
llvm/include/llvm/AsmParser/LLToken.h | 1 +
llvm/include/llvm/Bitcode/LLVMBitCodes.h | 4 +
llvm/include/llvm/CodeGen/TargetLowering.h | 22 +-
llvm/include/llvm/IR/IRBuilder.h | 6 +-
llvm/include/llvm/IR/Instructions.h | 13 +-
llvm/lib/AsmParser/LLLexer.cpp | 1 +
llvm/lib/AsmParser/LLParser.cpp | 46 +-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 12 +-
llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 96 +++-
llvm/lib/CodeGen/AtomicExpandPass.cpp | 252 ++++++---
llvm/lib/IR/AsmWriter.cpp | 5 +-
llvm/lib/IR/Instruction.cpp | 1 +
llvm/lib/IR/Instructions.cpp | 14 +-
llvm/lib/IR/Verifier.cpp | 20 +-
llvm/test/Assembler/atomic.ll | 6 +
...micrmw-elementwise-fadd-must-be-fp-type.ll | 8 +
.../invalid-atomicrmw-elementwise-scalar.ll | 8 +
llvm/test/Bitcode/atomicrmw-elementwise.ll | 16 +
.../CodeGen/NVPTX/atomicrmw-elementwise.ll | 506 ++++++++++++++++++
.../AMDGPU/expand-atomic-rmw-elementwise.ll | 138 +++++
.../NVPTX/expand-atomic-rmw-elementwise.ll | 262 +++++++++
.../X86/expand-atomic-rmw-elementwise.ll | 26 +
23 files changed, 1337 insertions(+), 142 deletions(-)
create mode 100644 llvm/test/Assembler/invalid-atomicrmw-elementwise-fadd-must-be-fp-type.ll
create mode 100644 llvm/test/Assembler/invalid-atomicrmw-elementwise-scalar.ll
create mode 100644 llvm/test/Bitcode/atomicrmw-elementwise.ll
create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-elementwise.ll
create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-elementwise.ll
create mode 100644 llvm/test/Transforms/AtomicExpand/NVPTX/expand-atomic-rmw-elementwise.ll
create mode 100644 llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-elementwise.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 13883883d3981..9d6b83235724f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -12097,7 +12097,7 @@ Syntax:
::
- atomicrmw [volatile] <operation> ptr <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering>[, align <alignment>] ; yields ty
+ atomicrmw [volatile] [elementwise] <operation> ptr <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering>[, align <alignment>] ; yields ty
Overview:
"""""""""
@@ -12144,7 +12144,9 @@ or fixed vector of floating-point type. The type of the '``<pointer>``'
operand must be a pointer to that type. If the ``atomicrmw`` is marked
as ``volatile``, then the optimizer is not allowed to modify the
number or order of execution of this ``atomicrmw`` with other
-:ref:`volatile operations <volatile>`.
+:ref:`volatile operations <volatile>`. If the ``elementwise`` modifier is present,
+then ``<value>`` must be a fixed vector type whose element type is legal for the
+corresponding scalar ``atomicrmw`` operation.
Note: if the alignment is not greater or equal to the size of the `<value>`
type, the atomic operation is likely to require a lock and have poor
@@ -12159,6 +12161,16 @@ isn't specified.
An ``atomicrmw`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.
+If the ``elementwise`` modifier is present, the instruction has per-element
+vector atomic semantics: it behaves as if it were expanded into one scalar
+``atomicrmw`` per element, executed in an arbitrary order. Without
+``elementwise``, a vector ``atomicrmw`` retains whole-value atomic semantics.
+
+Targets may implement ``atomicrmw elementwise`` either by lowering it to a
+native elementwise vector atomic, by scalarizing it into per-element scalar
+``atomicrmw`` operations, or by using an existing stronger whole-value atomic
+implementation, as long as the observable semantics are at least as strong as
+the IR definition above.
+
Semantics:
""""""""""
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index c138fb5ccc55b..eed62572d44bd 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -91,6 +91,7 @@ enum Kind {
kw_unwind,
kw_datalayout,
kw_volatile,
+ kw_elementwise,
kw_atomic,
kw_unordered,
kw_monotonic,
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9162754bbfe1a..95787c595dff7 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -524,6 +524,10 @@ enum RMWOperations {
RMW_FMINIMUMNUM = 22,
};
+enum RMWOperationFlags {
+  // Encoded in the same record field as the RMWOperations opcode above; the
+  // opcodes currently fit in the low 5 bits, so bit 5 is free for flags.
+  RMW_ELEMENTWISE_FLAG = 1 << 5,
+};
+
/// OverflowingBinaryOperatorOptionalFlags - Flags for serializing
/// OverflowingBinaryOperator's SubclassOptionalData contents.
enum OverflowingBinaryOperatorOptionalFlags {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 51c00b2591ecf..71f1c75394c3e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2443,13 +2443,33 @@ class LLVM_ABI TargetLoweringBase {
}
/// Returns how the IR-level AtomicExpand pass should expand the given
- /// AtomicRMW, if at all. Default is to never expand.
+  /// AtomicRMW, if at all. By default, floating-point atomics are expanded
+  /// via CmpXChg and all other atomics are left unexpanded.
+ ///
+ /// Precondition: \p RMW is not elementwise. Elementwise atomicrmw
+ /// instructions are routed through \c shouldExpandAtomicRMWElementwiseInIR.
virtual AtomicExpansionKind
shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
return RMW->isFloatingPointOperation() ?
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
}
+ /// Returns whether the IR-level AtomicExpand pass should expand the given
+ /// elementwise AtomicRMW into per-lane scalar atomicrmw instructions.
+ ///
+ /// Returning \c true (the default) tells AtomicExpand to first try to
+ /// conservatively drop the elementwise modifier and reuse an existing
+ /// whole-value atomicrmw lowering. If that is not possible, it scalarizes
+ /// into per-lane scalar atomicrmw instructions that are each fed back
+ /// through the normal atomic expansion pipeline.
+ ///
+ /// Targets that support native vector atomic instructions should return
+ /// \c false to preserve the elementwise atomicrmw for the backend.
+ virtual bool
+ shouldExpandAtomicRMWElementwiseInIR(const AtomicRMWInst *RMW) const {
+ return true;
+ }
+
/// Returns how the given atomic atomicrmw should be cast by the IR-level
/// AtomicExpand pass.
virtual AtomicExpansionKind
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 4ed3d73c4a057..9ebac9b8c79e9 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1941,13 +1941,15 @@ class IRBuilderBase {
AtomicRMWInst *CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr,
Value *Val, MaybeAlign Align,
AtomicOrdering Ordering,
- SyncScope::ID SSID = SyncScope::System) {
+ SyncScope::ID SSID = SyncScope::System,
+ bool Elementwise = false) {
if (!Align) {
const DataLayout &DL = BB->getDataLayout();
Align = llvm::Align(DL.getTypeStoreSize(Val->getType()));
}
- return Insert(new AtomicRMWInst(Op, Ptr, Val, *Align, Ordering, SSID));
+ return Insert(
+ new AtomicRMWInst(Op, Ptr, Val, *Align, Ordering, SSID, Elementwise));
}
CallInst *CreateStructuredGEP(Type *BaseType, Value *PtrBase,
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index bdc6d5bd2f5e5..9267be115b0cd 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -809,7 +809,7 @@ class AtomicRMWInst : public Instruction {
public:
LLVM_ABI AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
- SyncScope::ID SSID,
+ SyncScope::ID SSID, bool Elementwise = false,
InsertPosition InsertBefore = nullptr);
// allocate space for exactly two operands
@@ -867,6 +867,12 @@ class AtomicRMWInst : public Instruction {
///
void setVolatile(bool V) { setSubclassData<VolatileField>(V); }
+ /// Return true if this RMW has elementwise vector semantics.
+ bool isElementwise() const { return Elementwise; }
+
+ /// Specify whether this RMW has elementwise vector semantics.
+ void setElementwise(bool V) { Elementwise = V; }
+
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -920,7 +926,7 @@ class AtomicRMWInst : public Instruction {
private:
void Init(BinOp Operation, Value *Ptr, Value *Val, Align Align,
- AtomicOrdering Ordering, SyncScope::ID SSID);
+ AtomicOrdering Ordering, SyncScope::ID SSID, bool Elementwise);
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
@@ -933,6 +939,9 @@ class AtomicRMWInst : public Instruction {
/// room in SubClassData for everything, so synchronization scope ID gets its
/// own field.
SyncScope::ID SSID;
+
+ /// Whether this instruction uses per-lane vector atomic semantics.
+ bool Elementwise = false;
};
template <>
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 60050ae477a01..7466a381f7fa3 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -600,6 +600,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(unwind);
KEYWORD(datalayout);
KEYWORD(volatile);
+ KEYWORD(elementwise);
KEYWORD(atomic);
KEYWORD(unordered);
KEYWORD(monotonic);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 917aecb47590d..997eea293291d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8962,20 +8962,24 @@ int LLParser::parseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
}
/// parseAtomicRMW
-/// ::= 'atomicrmw' 'volatile'? BinOp TypeAndValue ',' TypeAndValue
+/// ::= 'atomicrmw' 'volatile'? 'elementwise'? BinOp TypeAndValue ','
+/// TypeAndValue
/// 'singlethread'? AtomicOrdering
int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
bool AteExtraComma = false;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
SyncScope::ID SSID = SyncScope::System;
- bool isVolatile = false;
+ bool IsVolatile = false;
+ bool IsElementwise = false;
bool IsFP = false;
AtomicRMWInst::BinOp Operation;
MaybeAlign Alignment;
if (EatIfPresent(lltok::kw_volatile))
- isVolatile = true;
+ IsVolatile = true;
+ if (EatIfPresent(lltok::kw_elementwise))
+ IsElementwise = true;
switch (Lex.getKind()) {
default:
@@ -9052,42 +9056,54 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
if (Val->getType()->isScalableTy())
return error(ValLoc, "atomicrmw operand may not be scalable");
+  // For elementwise ops, the value must be a fixed vector type whose element
+  // type is legal for the corresponding scalar atomicrmw operation. Point
+  // ScalarTy at the element type for elementwise ops so the checks below
+  // apply per element.
+ Type *ScalarTy = Val->getType();
+ if (IsElementwise) {
+ auto *VecTy = dyn_cast<FixedVectorType>(Val->getType());
+ if (!VecTy)
+ return error(ValLoc,
+ "atomicrmw elementwise operand must be a fixed vector type");
+ ScalarTy = VecTy->getElementType();
+ }
+
if (Operation == AtomicRMWInst::Xchg) {
- if (!Val->getType()->isIntegerTy() &&
- !Val->getType()->isFloatingPointTy() &&
- !Val->getType()->isPointerTy()) {
+ if (!ScalarTy->isIntegerTy() && !ScalarTy->isFloatingPointTy() &&
+ !ScalarTy->isPointerTy()) {
return error(
ValLoc,
"atomicrmw " + AtomicRMWInst::getOperationName(Operation) +
" operand must be an integer, floating point, or pointer type");
}
} else if (IsFP) {
- if (!Val->getType()->isFPOrFPVectorTy()) {
+ if (!ScalarTy->isFPOrFPVectorTy()) {
return error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be a floating point type");
}
} else {
- if (!Val->getType()->isIntegerTy()) {
+ if (!ScalarTy->isIntegerTy()) {
return error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be an integer");
}
}
- unsigned Size =
- PFS.getFunction().getDataLayout().getTypeStoreSizeInBits(
- Val->getType());
+  // Elementwise ops are legal on types like <3 x i32> because they can be
+  // expanded per element, so check the scalar type, not the vector type.
+ unsigned Size = PFS.getFunction().getDataLayout().getTypeStoreSizeInBits(
+ IsElementwise ? ScalarTy : Val->getType());
if (Size < 8 || (Size & (Size - 1)))
return error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
" integer");
const Align DefaultAlignment(
PFS.getFunction().getDataLayout().getTypeStoreSize(
Val->getType()));
- AtomicRMWInst *RMWI =
- new AtomicRMWInst(Operation, Ptr, Val,
- Alignment.value_or(DefaultAlignment), Ordering, SSID);
- RMWI->setVolatile(isVolatile);
+ AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val,
+ Alignment.value_or(DefaultAlignment),
+ Ordering, SSID, IsElementwise);
+ RMWI->setVolatile(IsVolatile);
Inst = RMWI;
return AteExtraComma ? InstExtraComma : InstNormal;
}
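[Editor's illustration of the per-element size check above, not from the
patch: a <3 x i32> value is 12 bytes, which would fail the power-of-two check
as a whole, but each i32 lane is a legal 4-byte scalar atomic, so the parser
accepts:

    %old = atomicrmw elementwise add ptr %p, <3 x i32> %v monotonic
]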
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index e01cf501b8841..60c8c7812ce81 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1364,8 +1364,10 @@ static int getDecodedBinaryOpcode(unsigned Val, Type *Ty) {
}
}
-static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val) {
- switch (Val) {
+static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val,
+ bool &IsElementwise) {
+ IsElementwise = Val & bitc::RMW_ELEMENTWISE_FLAG;
+ switch (Val & ~bitc::RMW_ELEMENTWISE_FLAG) {
default: return AtomicRMWInst::BAD_BINOP;
case bitc::RMW_XCHG: return AtomicRMWInst::Xchg;
case bitc::RMW_ADD: return AtomicRMWInst::Add;
@@ -6709,8 +6711,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!(NumRecords == (OpNum + 4) || NumRecords == (OpNum + 5)))
return error("Invalid atomicrmw record");
+ bool IsElementwise = false;
const AtomicRMWInst::BinOp Operation =
- getDecodedRMWOperation(Record[OpNum]);
+ getDecodedRMWOperation(Record[OpNum], IsElementwise);
if (Operation < AtomicRMWInst::FIRST_BINOP ||
Operation > AtomicRMWInst::LAST_BINOP)
return error("Invalid atomicrmw record");
@@ -6735,7 +6738,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Alignment =
Align(TheModule->getDataLayout().getTypeStoreSize(Val->getType()));
- I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID);
+ I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID,
+ IsElementwise);
ResTypeID = ValTypeID;
cast<AtomicRMWInst>(I)->setVolatile(IsVol);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 71d5a8bc98a4b..22052105d4d75 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -690,41 +690,84 @@ static unsigned getEncodedBinaryOpcode(unsigned Opcode) {
}
}
-static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
- switch (Op) {
+static unsigned getEncodedRMWOperation(const AtomicRMWInst &I) {
+ unsigned Encoding = 0;
+ switch (I.getOperation()) {
default: llvm_unreachable("Unknown RMW operation!");
- case AtomicRMWInst::Xchg: return bitc::RMW_XCHG;
- case AtomicRMWInst::Add: return bitc::RMW_ADD;
- case AtomicRMWInst::Sub: return bitc::RMW_SUB;
- case AtomicRMWInst::And: return bitc::RMW_AND;
- case AtomicRMWInst::Nand: return bitc::RMW_NAND;
- case AtomicRMWInst::Or: return bitc::RMW_OR;
- case AtomicRMWInst::Xor: return bitc::RMW_XOR;
- case AtomicRMWInst::Max: return bitc::RMW_MAX;
- case AtomicRMWInst::Min: return bitc::RMW_MIN;
- case AtomicRMWInst::UMax: return bitc::RMW_UMAX;
- case AtomicRMWInst::UMin: return bitc::RMW_UMIN;
- case AtomicRMWInst::FAdd: return bitc::RMW_FADD;
- case AtomicRMWInst::FSub: return bitc::RMW_FSUB;
- case AtomicRMWInst::FMax: return bitc::RMW_FMAX;
- case AtomicRMWInst::FMin: return bitc::RMW_FMIN;
+ case AtomicRMWInst::Xchg:
+ Encoding = bitc::RMW_XCHG;
+ break;
+ case AtomicRMWInst::Add:
+ Encoding = bitc::RMW_ADD;
+ break;
+ case AtomicRMWInst::Sub:
+ Encoding = bitc::RMW_SUB;
+ break;
+ case AtomicRMWInst::And:
+ Encoding = bitc::RMW_AND;
+ break;
+ case AtomicRMWInst::Nand:
+ Encoding = bitc::RMW_NAND;
+ break;
+ case AtomicRMWInst::Or:
+ Encoding = bitc::RMW_OR;
+ break;
+ case AtomicRMWInst::Xor:
+ Encoding = bitc::RMW_XOR;
+ break;
+ case AtomicRMWInst::Max:
+ Encoding = bitc::RMW_MAX;
+ break;
+ case AtomicRMWInst::Min:
+ Encoding = bitc::RMW_MIN;
+ break;
+ case AtomicRMWInst::UMax:
+ Encoding = bitc::RMW_UMAX;
+ break;
+ case AtomicRMWInst::UMin:
+ Encoding = bitc::RMW_UMIN;
+ break;
+ case AtomicRMWInst::FAdd:
+ Encoding = bitc::RMW_FADD;
+ break;
+ case AtomicRMWInst::FSub:
+ Encoding = bitc::RMW_FSUB;
+ break;
+ case AtomicRMWInst::FMax:
+ Encoding = bitc::RMW_FMAX;
+ break;
+ case AtomicRMWInst::FMin:
+ Encoding = bitc::RMW_FMIN;
+ break;
case AtomicRMWInst::FMaximum:
- return bitc::RMW_FMAXIMUM;
+ Encoding = bitc::RMW_FMAXIMUM;
+ break;
case AtomicRMWInst::FMinimum:
- return bitc::RMW_FMINIMUM;
+ Encoding = bitc::RMW_FMINIMUM;
+ break;
case AtomicRMWInst::FMaximumNum:
- return bitc::RMW_FMAXIMUMNUM;
+ Encoding = bitc::RMW_FMAXIMUMNUM;
+ break;
case AtomicRMWInst::FMinimumNum:
- return bitc::RMW_FMINIMUMNUM;
+ Encoding = bitc::RMW_FMINIMUMNUM;
+ break;
case AtomicRMWInst::UIncWrap:
- return bitc::RMW_UINC_WRAP;
+ Encoding = bitc::RMW_UINC_WRAP;
+ break;
case AtomicRMWInst::UDecWrap:
- return bitc::RMW_UDEC_WRAP;
+ Encoding = bitc::RMW_UDEC_WRAP;
+ break;
case AtomicRMWInst::USubCond:
- return bitc::RMW_USUB_COND;
+ Encoding = bitc::RMW_USUB_COND;
+ break;
case AtomicRMWInst::USubSat:
- return bitc::RMW_USUB_SAT;
+ Encoding = bitc::RMW_USUB_SAT;
+ break;
}
+
+ if (I.isElementwise())
+ Encoding |= bitc::RMW_ELEMENTWISE_FLAG;
+ return Encoding;
}
static unsigned getEncodedOrdering(AtomicOrdering Ordering) {
@@ -3547,8 +3590,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Code = bitc::FUNC_CODE_INST_ATOMICRMW;
pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr
pushValueAndType(I.getOperand(1), InstID, Vals); // valty + val
- Vals.push_back(
- getEncodedRMWOperation(cast<AtomicRMWInst>(I).getOperation()));
+ Vals.push_back(getEncodedRMWOperation(cast<AtomicRMWInst>(I)));
Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
Vals.push_back(getEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
Vals.push_back(
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 9585ebae24827..057e43427f1e2 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -86,6 +87,10 @@ class AtomicExpandImpl {
}
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool tryInsertTrailingSeqCstFence(Instruction *AtomicI);
+ template <typename AtomicInst>
+ bool tryInsertFencesForAtomic(AtomicInst *AtomicI, bool OrderingRequiresFence,
+ AtomicOrdering NewOrdering);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -95,6 +100,8 @@ class AtomicExpandImpl {
bool tryExpandAtomicStore(StoreInst *SI);
void expandAtomicStoreToXChg(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
+ bool canReuseWholeValueAtomicRMW(AtomicRMWInst *AI);
+ bool expandElementwiseAtomicRMW(AtomicRMWInst *AI);
AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
Value *
insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,
@@ -261,16 +268,53 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}
-bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
- auto *LI = dyn_cast<LoadInst>(I);
- auto *SI = dyn_cast<StoreInst>(I);
- auto *RMWI = dyn_cast<AtomicRMWInst>(I);
- auto *CASI = dyn_cast<AtomicCmpXchgInst>(I);
+/// Returns true if we can lower an elementwise atomicrmw by reusing the
+/// target's whole-value atomicrmw lowering.
+bool AtomicExpandImpl::canReuseWholeValueAtomicRMW(AtomicRMWInst *AI) {
+ assert(AI->isElementwise() && "expected elementwise atomicrmw");
- bool MadeChange = false;
+ // Integer non-elementwise vector atomicrmw is illegal IR, so we need to be
+ // careful to reject these before removing the elementwise modifier.
+ if (!AI->isFloatingPointOperation())
+ return false;
+
+ AI->setElementwise(false);
+ bool CanReuse = atomicSizeSupported(TLI, AI) &&
+ TLI->shouldExpandAtomicRMWInIR(AI) ==
+ TargetLoweringBase::AtomicExpansionKind::None;
+ AI->setElementwise(true);
+ return CanReuse;
+}
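[Editor's sketch of the reuse path this enables, assuming a target whose
shouldExpandAtomicRMWInIR returns None for the whole-value form. AtomicExpand
can rewrite

    %old = atomicrmw elementwise fadd ptr %p, <2 x half> %v monotonic

into the strictly stronger whole-value form

    %old = atomicrmw fadd ptr %p, <2 x half> %v monotonic

since whole-value atomicity implies per-lane atomicity.]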
- // If the Size/Alignment is not supported, replace with a libcall.
- if (LI) {
+// Emit a trailing seq_cst fence after the atomic instruction if the target
+// requests one for atomic stores.
+bool AtomicExpandImpl::tryInsertTrailingSeqCstFence(Instruction *AtomicI) {
+ if (!TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(AtomicI))
+ return false;
+
+ IRBuilder Builder(AtomicI);
+ if (auto *TrailingFence = TLI->emitTrailingFence(
+ Builder, AtomicI, AtomicOrdering::SequentiallyConsistent)) {
+ TrailingFence->moveAfter(AtomicI);
+ return true;
+ }
+ return false;
+}
+
+// If the target wants explicit fences around this atomic and its ordering is
+// strong enough to need them, weaken the instruction's ordering and bracket
+// it with fences; otherwise fall back to a trailing seq_cst fence.
+template <typename AtomicInst>
+bool AtomicExpandImpl::tryInsertFencesForAtomic(AtomicInst *AtomicI,
+ bool OrderingRequiresFence,
+ AtomicOrdering NewOrdering) {
+ bool ShouldInsertFences = TLI->shouldInsertFencesForAtomic(AtomicI);
+ if (OrderingRequiresFence && ShouldInsertFences) {
+ AtomicOrdering FenceOrdering = AtomicI->getOrdering();
+ AtomicI->setOrdering(NewOrdering);
+ return bracketInstWithFences(AtomicI, FenceOrdering);
+ }
+ if (!ShouldInsertFences)
+ return tryInsertTrailingSeqCstFence(AtomicI);
+ return false;
+}
+
+bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isAtomic())
return false;
@@ -279,12 +323,21 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
return true;
}
+ bool MadeChange = false;
if (TLI->shouldCastAtomicLoadInIR(LI) ==
TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
- I = LI = convertAtomicLoadToIntegerType(LI);
+ LI = convertAtomicLoadToIntegerType(LI);
MadeChange = true;
}
- } else if (SI) {
+
+ MadeChange |= tryInsertFencesForAtomic(
+ LI, isAcquireOrStronger(LI->getOrdering()), AtomicOrdering::Monotonic);
+
+ MadeChange |= tryExpandAtomicLoad(LI);
+ return MadeChange;
+ }
+
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
if (!SI->isAtomic())
return false;
@@ -293,23 +346,66 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
return true;
}
+ bool MadeChange = false;
if (TLI->shouldCastAtomicStoreInIR(SI) ==
TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
- I = SI = convertAtomicStoreToIntegerType(SI);
+ SI = convertAtomicStoreToIntegerType(SI);
MadeChange = true;
}
- } else if (RMWI) {
+
+ MadeChange |= tryInsertFencesForAtomic(
+ SI, isReleaseOrStronger(SI->getOrdering()), AtomicOrdering::Monotonic);
+
+ MadeChange |= tryExpandAtomicStore(SI);
+ return MadeChange;
+ }
+
+ if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
if (!atomicSizeSupported(TLI, RMWI)) {
+ if (RMWI->isElementwise())
+ return expandElementwiseAtomicRMW(RMWI);
expandAtomicRMWToLibcall(RMWI);
return true;
}
+ bool MadeChange = false;
+
+ if (RMWI->isElementwise()) {
+ if (!TLI->shouldExpandAtomicRMWElementwiseInIR(RMWI))
+ return false;
+ if (canReuseWholeValueAtomicRMW(RMWI)) {
+ // Dropping the elementwise modifier strengthens the semantics, which is
+ // conservatively correct. Prefer the target's existing whole-value
+ // lowering over IR expansion.
+ RMWI->setElementwise(false);
+ MadeChange = true;
+ } else {
+ return expandElementwiseAtomicRMW(RMWI);
+ }
+ }
+
if (TLI->shouldCastAtomicRMWIInIR(RMWI) ==
TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
- I = RMWI = convertAtomicXchgToIntegerType(RMWI);
+ RMWI = convertAtomicXchgToIntegerType(RMWI);
MadeChange = true;
}
- } else if (CASI) {
+
+ MadeChange |= tryInsertFencesForAtomic(
+ RMWI,
+ isReleaseOrStronger(RMWI->getOrdering()) ||
+ isAcquireOrStronger(RMWI->getOrdering()),
+ TLI->atomicOperationOrderAfterFenceSplit(RMWI));
+
+ // There are two different ways of expanding RMW instructions:
+ // - into a load if it is idempotent
+ // - into a Cmpxchg/LL-SC loop otherwise
+    // We try them in that order.
+ MadeChange |= (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) ||
+ tryExpandAtomicRMW(RMWI);
+ return MadeChange;
+ }
+
+ if (auto *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
if (!atomicSizeSupported(TLI, CASI)) {
expandAtomicCASToLibcall(CASI);
return true;
@@ -317,79 +413,42 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// TODO: when we're ready to make the change at the IR level, we can
// extend convertCmpXchgToInteger for floating point too.
+ bool MadeChange = false;
if (CASI->getCompareOperand()->getType()->isPointerTy()) {
// TODO: add a TLI hook to control this so that each target can
// convert to lowering the original type one at a time.
- I = CASI = convertCmpXchgToIntegerType(CASI);
+ CASI = convertCmpXchgToIntegerType(CASI);
MadeChange = true;
}
- } else
- return false;
- if (TLI->shouldInsertFencesForAtomic(I)) {
- auto FenceOrdering = AtomicOrdering::Monotonic;
- if (LI && isAcquireOrStronger(LI->getOrdering())) {
- FenceOrdering = LI->getOrdering();
- LI->setOrdering(AtomicOrdering::Monotonic);
- } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
- FenceOrdering = SI->getOrdering();
- SI->setOrdering(AtomicOrdering::Monotonic);
- } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
- isAcquireOrStronger(RMWI->getOrdering()))) {
- FenceOrdering = RMWI->getOrdering();
- RMWI->setOrdering(TLI->atomicOperationOrderAfterFenceSplit(RMWI));
- } else if (CASI &&
- TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
- TargetLoweringBase::AtomicExpansionKind::None &&
- (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
- isAcquireOrStronger(CASI->getSuccessOrdering()) ||
- isAcquireOrStronger(CASI->getFailureOrdering()))) {
- // If a compare and swap is lowered to LL/SC, we can do smarter fence
- // insertion, with a stronger one on the success path than on the
- // failure path. As a result, fence insertion is directly done by
- // expandAtomicCmpXchg in that case.
- FenceOrdering = CASI->getMergedOrdering();
- auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
-
- CASI->setSuccessOrdering(CASOrdering);
- CASI->setFailureOrdering(CASOrdering);
- }
-
- if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
- }
- } else if (TLI->shouldInsertTrailingSeqCstFenceForAtomicStore(I) &&
- !(CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
- TargetLoweringBase::AtomicExpansionKind::LLSC)) {
- // CmpXchg LLSC is handled in expandAtomicCmpXchg().
- IRBuilder Builder(I);
- if (auto TrailingFence = TLI->emitTrailingFence(
- Builder, I, AtomicOrdering::SequentiallyConsistent)) {
- TrailingFence->moveAfter(I);
- MadeChange = true;
+ auto CmpXchgExpansion = TLI->shouldExpandAtomicCmpXchgInIR(CASI);
+ if (TLI->shouldInsertFencesForAtomic(CASI)) {
+ if (CmpXchgExpansion == TargetLoweringBase::AtomicExpansionKind::None &&
+ (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getFailureOrdering()))) {
+ // If a compare and swap is lowered to LL/SC, we can do smarter fence
+ // insertion, with a stronger one on the success path than on the
+ // failure path. As a result, fence insertion is directly done by
+ // expandAtomicCmpXchg in that case.
+ AtomicOrdering FenceOrdering = CASI->getMergedOrdering();
+ AtomicOrdering CASOrdering =
+ TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ CASI->setSuccessOrdering(CASOrdering);
+ CASI->setFailureOrdering(CASOrdering);
+ MadeChange |= bracketInstWithFences(CASI, FenceOrdering);
+ }
+ } else if (CmpXchgExpansion !=
+ TargetLoweringBase::AtomicExpansionKind::LLSC) {
+ // CmpXchg LLSC is handled in expandAtomicCmpXchg().
+ MadeChange |= tryInsertTrailingSeqCstFence(CASI);
}
- }
-
- if (LI)
- MadeChange |= tryExpandAtomicLoad(LI);
- else if (SI)
- MadeChange |= tryExpandAtomicStore(SI);
- else if (RMWI) {
- // There are two different ways of expanding RMW instructions:
- // - into a load if it is idempotent
- // - into a Cmpxchg/LL-SC loop otherwise
- // we try them in that order.
- if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
- MadeChange = true;
-
- } else {
- MadeChange |= tryExpandAtomicRMW(RMWI);
- }
- } else if (CASI)
MadeChange |= tryExpandAtomicCmpXchg(CASI);
+ return MadeChange;
+ }
- return MadeChange;
+ return false;
}
bool AtomicExpandImpl::run(
@@ -543,6 +602,45 @@ AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
return NewRMWI;
}
+// Scalarize an elementwise vector atomicrmw into one scalar atomicrmw per
+// lane. Each lane keeps the original ordering/scope/volatility and is fed back
+// through processAtomicInstr() so the usual atomic expansion and fence logic
+// applies per lane rather than to the whole vector value.
+bool AtomicExpandImpl::expandElementwiseAtomicRMW(AtomicRMWInst *AI) {
+ auto *VecTy = cast<FixedVectorType>(AI->getType());
+ Type *LaneTy = VecTy->getElementType();
+ LLVMContext &Ctx = AI->getContext();
+ Value *Result = Constant::getNullValue(VecTy);
+ const uint64_t LaneSize = DL->getTypeStoreSize(LaneTy).getFixedValue();
+
+ for (unsigned Lane = 0, NumLanes = VecTy->getNumElements(); Lane != NumLanes;
+ ++Lane) {
+ ReplacementIRBuilder Builder(AI, *DL);
+ Value *Idx0 = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
+ Value *Idx = ConstantInt::get(Type::getInt64Ty(Ctx), Lane);
+ Value *Indices[] = {Idx0, Idx};
+ Value *LanePtr = Builder.CreateInBoundsGEP(VecTy, AI->getPointerOperand(),
+ Indices, "lane.ptr");
+ Value *LaneVal =
+ Builder.CreateExtractElement(AI->getValOperand(), Idx, "lane.val");
+ auto *LaneRMW = Builder.CreateAtomicRMW(
+ AI->getOperation(), LanePtr, LaneVal,
+ commonAlignment(AI->getAlign(), LaneSize * Lane), AI->getOrdering(),
+ AI->getSyncScopeID());
+ LaneRMW->setVolatile(AI->isVolatile());
+ copyMetadataForAtomic(*LaneRMW, *AI);
+ Result = Builder.CreateInsertElement(Result, LaneRMW, Idx, "lane.old");
+
+ // Each scalar lane atomic may still need casts, fences, or further
+ // expansion, so re-run the normal atomic pipeline on it.
+ processAtomicInstr(LaneRMW);
+ }
+
+ AI->replaceAllUsesWith(Result);
+ AI->eraseFromParent();
+ return true;
+}
+
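[Editor's hand-written sketch, not generated output, of what this expansion
produces for a two-lane input, before each lane is re-run through
processAtomicInstr:

    ; Before:
    %old = atomicrmw elementwise fadd ptr %p, <2 x float> %v seq_cst, align 8

    ; After expandElementwiseAtomicRMW:
    %lane.ptr = getelementptr inbounds <2 x float>, ptr %p, i64 0, i64 0
    %lane.val = extractelement <2 x float> %v, i64 0
    %lane.rmw = atomicrmw fadd ptr %lane.ptr, float %lane.val seq_cst, align 8
    %lane.old = insertelement <2 x float> zeroinitializer, float %lane.rmw, i64 0
    %lane.ptr1 = getelementptr inbounds <2 x float>, ptr %p, i64 0, i64 1
    %lane.val1 = extractelement <2 x float> %v, i64 1
    %lane.rmw1 = atomicrmw fadd ptr %lane.ptr1, float %lane.val1 seq_cst, align 4
    %lane.old1 = insertelement <2 x float> %lane.old, float %lane.rmw1, i64 1

The result vector starts as a null value (Constant::getNullValue) and each
lane's old value is inserted as it is produced.]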
bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 7bff1e307237c..f8a421ba77882 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4468,8 +4468,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ' << CI->getPredicate();
// Print out the atomicrmw operation
- if (const auto *RMWI = dyn_cast<AtomicRMWInst>(&I))
+ if (const auto *RMWI = dyn_cast<AtomicRMWInst>(&I)) {
+ if (RMWI->isElementwise())
+ Out << " elementwise";
Out << ' ' << AtomicRMWInst::getOperationName(RMWI->getOperation());
+ }
// Print out the type of the operands...
const Value *Operand = I.getNumOperands() ? I.getOperand(0) : nullptr;
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 5205d36a228c1..db2d959e8eb87 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -955,6 +955,7 @@ bool Instruction::hasSameSpecialState(const Instruction *I2,
cast<AtomicCmpXchgInst>(I2)->getSyncScopeID();
if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1))
return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() &&
+ RMWI->isElementwise() == cast<AtomicRMWInst>(I2)->isElementwise() &&
RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
(RMWI->getAlign() == cast<AtomicRMWInst>(I2)->getAlign() ||
IgnoreAlignment) &&
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 8a220c48acac8..f940893ab0296 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1436,7 +1436,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
- SyncScope::ID SSID) {
+ SyncScope::ID SSID, bool Elementwise) {
assert(Ordering != AtomicOrdering::NotAtomic &&
"atomicrmw instructions can only be atomic.");
assert(Ordering != AtomicOrdering::Unordered &&
@@ -1446,6 +1446,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
setOperation(Operation);
setOrdering(Ordering);
setSyncScopeID(SSID);
+ setElementwise(Elementwise);
setAlignment(Alignment);
assert(getOperand(0) && getOperand(1) && "All operands must be non-null!");
@@ -1457,9 +1458,10 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
- SyncScope::ID SSID, InsertPosition InsertBefore)
+ SyncScope::ID SSID, bool Elementwise,
+ InsertPosition InsertBefore)
: Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) {
- Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
+ Init(Operation, Ptr, Val, Alignment, Ordering, SSID, Elementwise);
}
StringRef AtomicRMWInst::getOperationName(BinOp Op) {
@@ -4397,9 +4399,9 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
}
AtomicRMWInst *AtomicRMWInst::cloneImpl() const {
- AtomicRMWInst *Result =
- new AtomicRMWInst(getOperation(), getOperand(0), getOperand(1),
- getAlign(), getOrdering(), getSyncScopeID());
+ AtomicRMWInst *Result = new AtomicRMWInst(
+ getOperation(), getOperand(0), getOperand(1), getAlign(), getOrdering(),
+ getSyncScopeID(), isElementwise());
Result->setVolatile(isVolatile());
return Result;
}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f4d4f81c12124..ac087552c3461 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4762,25 +4762,35 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
"atomicrmw instructions cannot be unordered.", &RMWI);
auto Op = RMWI.getOperation();
Type *ElTy = RMWI.getOperand(1)->getType();
+ Type *ScalarTy = ElTy;
+ if (RMWI.isElementwise()) {
+ auto *VecTy = dyn_cast<FixedVectorType>(ElTy);
+ Check(VecTy, "atomicrmw elementwise operand must have fixed vector type!",
+ &RMWI, ElTy);
+ if (VecTy)
+ ScalarTy = VecTy->getElementType();
+ }
+
if (Op == AtomicRMWInst::Xchg) {
- Check(ElTy->isIntegerTy() || ElTy->isFloatingPointTy() ||
- ElTy->isPointerTy(),
+ Check(ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
+ ScalarTy->isPointerTy(),
"atomicrmw " + AtomicRMWInst::getOperationName(Op) +
" operand must have integer or floating point type!",
&RMWI, ElTy);
} else if (AtomicRMWInst::isFPOperation(Op)) {
Check(ElTy->isFPOrFPVectorTy() && !isa<ScalableVectorType>(ElTy),
"atomicrmw " + AtomicRMWInst::getOperationName(Op) +
- " operand must have floating-point or fixed vector of floating-point "
+ " operand must have floating-point or fixed vector of "
+ "floating-point "
"type!",
&RMWI, ElTy);
} else {
- Check(ElTy->isIntegerTy(),
+ Check(ScalarTy->isIntegerTy(),
"atomicrmw " + AtomicRMWInst::getOperationName(Op) +
" operand must have integer type!",
&RMWI, ElTy);
}
- checkAtomicMemAccessSize(ElTy, &RMWI);
+ checkAtomicMemAccessSize(RMWI.isElementwise() ? ScalarTy : ElTy, &RMWI);
Check(AtomicRMWInst::FIRST_BINOP <= Op && Op <= AtomicRMWInst::LAST_BINOP,
"Invalid binary operation!", &RMWI);
visitInstruction(RMWI);
diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index 0ed34f0ad98ef..611a717fa9a8d 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -151,5 +151,11 @@ define void @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
; CHECK: %atomic.fminimumnum = atomicrmw fminimumnum ptr %x, <2 x half> %val seq_cst
%atomic.fminimumnum = atomicrmw fminimumnum ptr %x, <2 x half> %val seq_cst
+ ; CHECK: %atomic.elem.fadd = atomicrmw elementwise fadd ptr %x, <2 x half> %val monotonic
+ %atomic.elem.fadd = atomicrmw elementwise fadd ptr %x, <2 x half> %val monotonic
+
+ ; CHECK: %atomic.elem.fadd.vol = atomicrmw volatile elementwise fadd ptr %x, <2 x half> %val seq_cst
+ %atomic.elem.fadd.vol = atomicrmw volatile elementwise fadd ptr %x, <2 x half> %val seq_cst
+
ret void
}
diff --git a/llvm/test/Assembler/invalid-atomicrmw-elementwise-fadd-must-be-fp-type.ll b/llvm/test/Assembler/invalid-atomicrmw-elementwise-fadd-must-be-fp-type.ll
new file mode 100644
index 0000000000000..65a24212c20a5
--- /dev/null
+++ b/llvm/test/Assembler/invalid-atomicrmw-elementwise-fadd-must-be-fp-type.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+define <4 x i32> @bad_fadd(ptr %p, <4 x i32> %v) {
+ %old = atomicrmw elementwise fadd ptr %p, <4 x i32> %v monotonic
+ ret <4 x i32> %old
+}
+
+; CHECK: atomicrmw fadd operand must be a floating point type
diff --git a/llvm/test/Assembler/invalid-atomicrmw-elementwise-scalar.ll b/llvm/test/Assembler/invalid-atomicrmw-elementwise-scalar.ll
new file mode 100644
index 0000000000000..6037607a52eef
--- /dev/null
+++ b/llvm/test/Assembler/invalid-atomicrmw-elementwise-scalar.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+define i32 @bad_scalar(ptr %p, i32 %v) {
+ %old = atomicrmw elementwise add ptr %p, i32 %v monotonic
+ ret i32 %old
+}
+
+; CHECK: atomicrmw elementwise operand must be a fixed vector type
diff --git a/llvm/test/Bitcode/atomicrmw-elementwise.ll b/llvm/test/Bitcode/atomicrmw-elementwise.ll
new file mode 100644
index 0000000000000..db9c48a80047e
--- /dev/null
+++ b/llvm/test/Bitcode/atomicrmw-elementwise.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+; RUN: llvm-as %s -o - | verify-uselistorder
+
+define <4 x i32> @elem_add(ptr %p, <4 x i32> %v) {
+; CHECK-LABEL: @elem_add(
+; CHECK: %old = atomicrmw elementwise add ptr %p, <4 x i32> %v monotonic, align 16
+ %old = atomicrmw elementwise add ptr %p, <4 x i32> %v monotonic
+ ret <4 x i32> %old
+}
+
+define <4 x float> @elem_fadd(ptr %p, <4 x float> %v) {
+; CHECK-LABEL: @elem_fadd(
+; CHECK: %old = atomicrmw elementwise fadd ptr %p, <4 x float> %v seq_cst, align 16
+ %old = atomicrmw elementwise fadd ptr %p, <4 x float> %v seq_cst
+ ret <4 x float> %old
+}
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-elementwise.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-elementwise.ll
new file mode 100644
index 0000000000000..625c2634dd584
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-elementwise.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx84 | FileCheck %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx84 | %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @fadd_v2f32_elementwise(ptr %addr, <2 x float> %val) {
+; CHECK-LABEL: fadd_v2f32_elementwise(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_v2f32_elementwise_param_0];
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [fadd_v2f32_elementwise_param_1];
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r3, [%rd1], %r1;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r4, [%rd1+4], %r2;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r4};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <2 x float> %val monotonic
+ ret <2 x float> %old
+}
+
+define <4 x float> @fadd_v4f32_elementwise(ptr %addr, <4 x float> %val) {
+; CHECK-LABEL: fadd_v4f32_elementwise(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_v4f32_elementwise_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [fadd_v4f32_elementwise_param_1];
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r5, [%rd1], %r1;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r6, [%rd1+4], %r2;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r7, [%rd1+8], %r3;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r8, [%rd1+12], %r4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <4 x float> %val monotonic, align 16
+ ret <4 x float> %old
+}
+
+define <4 x float> @fadd_v4f32_elementwise_volatile(ptr %addr,
+; CHECK-LABEL: fadd_v4f32_elementwise_volatile(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_v4f32_elementwise_volatile_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [fadd_v4f32_elementwise_volatile_param_1];
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r5, [%rd1], %r1;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r6, [%rd1+4], %r2;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r7, [%rd1+8], %r3;
+; CHECK-NEXT: atom.relaxed.sys.add.f32 %r8, [%rd1+12], %r4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: ret;
+ <4 x float> %val) {
+entry:
+ %old = atomicrmw volatile elementwise fadd ptr %addr, <4 x float> %val monotonic, align 16
+ ret <4 x float> %old
+}
+
+define <4 x float> @fadd_v4f32_elementwise_block(ptr %addr, <4 x float> %val) {
+; CHECK-LABEL: fadd_v4f32_elementwise_block(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_v4f32_elementwise_block_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [fadd_v4f32_elementwise_block_param_1];
+; CHECK-NEXT: atom.relaxed.cta.add.f32 %r5, [%rd1], %r1;
+; CHECK-NEXT: atom.relaxed.cta.add.f32 %r6, [%rd1+4], %r2;
+; CHECK-NEXT: atom.relaxed.cta.add.f32 %r7, [%rd1+8], %r3;
+; CHECK-NEXT: atom.relaxed.cta.add.f32 %r8, [%rd1+12], %r4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <4 x float> %val syncscope("block") monotonic, align 16
+ ret <4 x float> %old
+}
+
+define <4 x float> @fadd_v4f32_elementwise_seq_cst(ptr %addr, <4 x float> %val) {
+; CHECK-LABEL: fadd_v4f32_elementwise_seq_cst(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [fadd_v4f32_elementwise_seq_cst_param_0];
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [fadd_v4f32_elementwise_seq_cst_param_1];
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.add.f32 %r5, [%rd1], %r1;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.add.f32 %r6, [%rd1+4], %r2;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.add.f32 %r7, [%rd1+8], %r3;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.add.f32 %r8, [%rd1+12], %r4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <4 x float> %val seq_cst, align 16
+ ret <4 x float> %old
+}
+
+define <8 x half> @fmin_v8f16_elementwise(ptr %addr, <8 x half> %val) {
+; CHECK-LABEL: fmin_v8f16_elementwise(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<9>;
+; CHECK-NEXT: .reg .b16 %rs<41>;
+; CHECK-NEXT: .reg .b32 %r<90>;
+; CHECK-NEXT: .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [fmin_v8f16_elementwise_param_1];
+; CHECK-NEXT: ld.param.b64 %rd5, [fmin_v8f16_elementwise_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, _}, %r25;
+; CHECK-NEXT: ld.b32 %r82, [%rd5];
+; CHECK-NEXT: $L__BB5_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r82;
+; CHECK-NEXT: min.f16 %rs10, %rs9, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r29, %rs10;
+; CHECK-NEXT: and.b32 %r30, %r82, -65536;
+; CHECK-NEXT: or.b32 %r31, %r30, %r29;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r1, [%rd5], %r82, %r31;
+; CHECK-NEXT: setp.ne.b32 %p1, %r1, %r82;
+; CHECK-NEXT: mov.b32 %r82, %r1;
+; CHECK-NEXT: @%p1 bra $L__BB5_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r1;
+; CHECK-NEXT: mov.b16 %rs12, 0x0000;
+; CHECK-NEXT: mov.b32 %r2, {%rs11, %rs12};
+; CHECK-NEXT: mov.b32 %r13, 0;
+; CHECK-NEXT: add.s64 %rd6, %rd5, 2;
+; CHECK-NEXT: mov.b32 {_, %rs2}, %r25;
+; CHECK-NEXT: and.b64 %rd1, %rd6, -4;
+; CHECK-NEXT: cvt.u32.u64 %r32, %rd6;
+; CHECK-NEXT: and.b32 %r33, %r32, 3;
+; CHECK-NEXT: shl.b32 %r3, %r33, 3;
+; CHECK-NEXT: mov.b32 %r34, 65535;
+; CHECK-NEXT: shl.b32 %r35, %r34, %r3;
+; CHECK-NEXT: not.b32 %r4, %r35;
+; CHECK-NEXT: ld.b32 %r83, [%rd1];
+; CHECK-NEXT: $L__BB5_3: // %atomicrmw.start5
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r36, %r83, %r3;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r36;
+; CHECK-NEXT: min.f16 %rs14, %rs13, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r37, %rs14;
+; CHECK-NEXT: shl.b32 %r38, %r37, %r3;
+; CHECK-NEXT: and.b32 %r39, %r83, %r4;
+; CHECK-NEXT: or.b32 %r40, %r39, %r38;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r83, %r40;
+; CHECK-NEXT: setp.ne.b32 %p2, %r5, %r83;
+; CHECK-NEXT: mov.b32 %r83, %r5;
+; CHECK-NEXT: @%p2 bra $L__BB5_3;
+; CHECK-NEXT: // %bb.4: // %atomicrmw.end4
+; CHECK-NEXT: shr.u32 %r41, %r5, %r3;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r41;
+; CHECK-NEXT: mov.b32 {%rs16, _}, %r2;
+; CHECK-NEXT: mov.b32 %r18, {%rs16, %rs15};
+; CHECK-NEXT: mov.b32 {%rs3, _}, %r26;
+; CHECK-NEXT: ld.b32 %r84, [%rd5+4];
+; CHECK-NEXT: $L__BB5_5: // %atomicrmw.start20
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r84;
+; CHECK-NEXT: min.f16 %rs18, %rs17, %rs3;
+; CHECK-NEXT: cvt.u32.u16 %r42, %rs18;
+; CHECK-NEXT: and.b32 %r43, %r84, -65536;
+; CHECK-NEXT: or.b32 %r44, %r43, %r42;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r6, [%rd5+4], %r84, %r44;
+; CHECK-NEXT: setp.ne.b32 %p3, %r6, %r84;
+; CHECK-NEXT: mov.b32 %r84, %r6;
+; CHECK-NEXT: @%p3 bra $L__BB5_5;
+; CHECK-NEXT: // %bb.6: // %atomicrmw.end19
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r6;
+; CHECK-NEXT: mov.b32 {_, %rs20}, %r13;
+; CHECK-NEXT: mov.b32 %r7, {%rs19, %rs20};
+; CHECK-NEXT: add.s64 %rd7, %rd5, 6;
+; CHECK-NEXT: mov.b32 {_, %rs4}, %r26;
+; CHECK-NEXT: and.b64 %rd2, %rd7, -4;
+; CHECK-NEXT: cvt.u32.u64 %r45, %rd7;
+; CHECK-NEXT: and.b32 %r46, %r45, 3;
+; CHECK-NEXT: shl.b32 %r8, %r46, 3;
+; CHECK-NEXT: mov.b32 %r47, 65535;
+; CHECK-NEXT: shl.b32 %r48, %r47, %r8;
+; CHECK-NEXT: not.b32 %r9, %r48;
+; CHECK-NEXT: ld.b32 %r85, [%rd2];
+; CHECK-NEXT: $L__BB5_7: // %atomicrmw.start38
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r49, %r85, %r8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r49;
+; CHECK-NEXT: min.f16 %rs22, %rs21, %rs4;
+; CHECK-NEXT: cvt.u32.u16 %r50, %rs22;
+; CHECK-NEXT: shl.b32 %r51, %r50, %r8;
+; CHECK-NEXT: and.b32 %r52, %r85, %r9;
+; CHECK-NEXT: or.b32 %r53, %r52, %r51;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r10, [%rd2], %r85, %r53;
+; CHECK-NEXT: setp.ne.b32 %p4, %r10, %r85;
+; CHECK-NEXT: mov.b32 %r85, %r10;
+; CHECK-NEXT: @%p4 bra $L__BB5_7;
+; CHECK-NEXT: // %bb.8: // %atomicrmw.end37
+; CHECK-NEXT: shr.u32 %r54, %r10, %r8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r54;
+; CHECK-NEXT: mov.b32 {%rs24, _}, %r7;
+; CHECK-NEXT: mov.b32 %r19, {%rs24, %rs23};
+; CHECK-NEXT: mov.b32 {%rs5, _}, %r27;
+; CHECK-NEXT: ld.b32 %r86, [%rd5+8];
+; CHECK-NEXT: $L__BB5_9: // %atomicrmw.start54
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r86;
+; CHECK-NEXT: min.f16 %rs26, %rs25, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r55, %rs26;
+; CHECK-NEXT: and.b32 %r56, %r86, -65536;
+; CHECK-NEXT: or.b32 %r57, %r56, %r55;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r11, [%rd5+8], %r86, %r57;
+; CHECK-NEXT: setp.ne.b32 %p5, %r11, %r86;
+; CHECK-NEXT: mov.b32 %r86, %r11;
+; CHECK-NEXT: @%p5 bra $L__BB5_9;
+; CHECK-NEXT: // %bb.10: // %atomicrmw.end53
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r11;
+; CHECK-NEXT: mov.b32 {_, %rs28}, %r13;
+; CHECK-NEXT: mov.b32 %r12, {%rs27, %rs28};
+; CHECK-NEXT: add.s64 %rd8, %rd5, 10;
+; CHECK-NEXT: mov.b32 {_, %rs6}, %r27;
+; CHECK-NEXT: and.b64 %rd3, %rd8, -4;
+; CHECK-NEXT: cvt.u32.u64 %r58, %rd8;
+; CHECK-NEXT: and.b32 %r59, %r58, 3;
+; CHECK-NEXT: shl.b32 %r14, %r59, 3;
+; CHECK-NEXT: mov.b32 %r60, 65535;
+; CHECK-NEXT: shl.b32 %r61, %r60, %r14;
+; CHECK-NEXT: not.b32 %r15, %r61;
+; CHECK-NEXT: ld.b32 %r87, [%rd3];
+; CHECK-NEXT: $L__BB5_11: // %atomicrmw.start72
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r62, %r87, %r14;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r62;
+; CHECK-NEXT: min.f16 %rs30, %rs29, %rs6;
+; CHECK-NEXT: cvt.u32.u16 %r63, %rs30;
+; CHECK-NEXT: shl.b32 %r64, %r63, %r14;
+; CHECK-NEXT: and.b32 %r65, %r87, %r15;
+; CHECK-NEXT: or.b32 %r66, %r65, %r64;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r16, [%rd3], %r87, %r66;
+; CHECK-NEXT: setp.ne.b32 %p6, %r16, %r87;
+; CHECK-NEXT: mov.b32 %r87, %r16;
+; CHECK-NEXT: @%p6 bra $L__BB5_11;
+; CHECK-NEXT: // %bb.12: // %atomicrmw.end71
+; CHECK-NEXT: shr.u32 %r67, %r16, %r14;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r67;
+; CHECK-NEXT: mov.b32 {%rs32, _}, %r12;
+; CHECK-NEXT: mov.b32 %r20, {%rs32, %rs31};
+; CHECK-NEXT: mov.b32 {%rs7, _}, %r28;
+; CHECK-NEXT: ld.b32 %r88, [%rd5+12];
+; CHECK-NEXT: $L__BB5_13: // %atomicrmw.start88
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs33, %r88;
+; CHECK-NEXT: min.f16 %rs34, %rs33, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r68, %rs34;
+; CHECK-NEXT: and.b32 %r69, %r88, -65536;
+; CHECK-NEXT: or.b32 %r70, %r69, %r68;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r17, [%rd5+12], %r88, %r70;
+; CHECK-NEXT: setp.ne.b32 %p7, %r17, %r88;
+; CHECK-NEXT: mov.b32 %r88, %r17;
+; CHECK-NEXT: @%p7 bra $L__BB5_13;
+; CHECK-NEXT: // %bb.14: // %atomicrmw.end87
+; CHECK-NEXT: cvt.u16.u32 %rs35, %r17;
+; CHECK-NEXT: mov.b32 {_, %rs36}, %r13;
+; CHECK-NEXT: mov.b32 %r21, {%rs35, %rs36};
+; CHECK-NEXT: add.s64 %rd9, %rd5, 14;
+; CHECK-NEXT: mov.b32 {_, %rs8}, %r28;
+; CHECK-NEXT: and.b64 %rd4, %rd9, -4;
+; CHECK-NEXT: cvt.u32.u64 %r71, %rd9;
+; CHECK-NEXT: and.b32 %r72, %r71, 3;
+; CHECK-NEXT: shl.b32 %r22, %r72, 3;
+; CHECK-NEXT: mov.b32 %r73, 65535;
+; CHECK-NEXT: shl.b32 %r74, %r73, %r22;
+; CHECK-NEXT: not.b32 %r23, %r74;
+; CHECK-NEXT: ld.b32 %r89, [%rd4];
+; CHECK-NEXT: $L__BB5_15: // %atomicrmw.start106
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r75, %r89, %r22;
+; CHECK-NEXT: cvt.u16.u32 %rs37, %r75;
+; CHECK-NEXT: min.f16 %rs38, %rs37, %rs8;
+; CHECK-NEXT: cvt.u32.u16 %r76, %rs38;
+; CHECK-NEXT: shl.b32 %r77, %r76, %r22;
+; CHECK-NEXT: and.b32 %r78, %r89, %r23;
+; CHECK-NEXT: or.b32 %r79, %r78, %r77;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r24, [%rd4], %r89, %r79;
+; CHECK-NEXT: setp.ne.b32 %p8, %r24, %r89;
+; CHECK-NEXT: mov.b32 %r89, %r24;
+; CHECK-NEXT: @%p8 bra $L__BB5_15;
+; CHECK-NEXT: // %bb.16: // %atomicrmw.end105
+; CHECK-NEXT: shr.u32 %r80, %r24, %r22;
+; CHECK-NEXT: cvt.u16.u32 %rs39, %r80;
+; CHECK-NEXT: mov.b32 {%rs40, _}, %r21;
+; CHECK-NEXT: mov.b32 %r81, {%rs40, %rs39};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r18, %r19, %r20, %r81};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fmin ptr %addr, <8 x half> %val monotonic, align 16
+ ret <8 x half> %old
+}
+
+define <8 x half> @fmax_v8f16_elementwise(ptr %addr, <8 x half> %val) {
+; CHECK-LABEL: fmax_v8f16_elementwise(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<9>;
+; CHECK-NEXT: .reg .b16 %rs<41>;
+; CHECK-NEXT: .reg .b32 %r<90>;
+; CHECK-NEXT: .reg .b64 %rd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [fmax_v8f16_elementwise_param_1];
+; CHECK-NEXT: ld.param.b64 %rd5, [fmax_v8f16_elementwise_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, _}, %r25;
+; CHECK-NEXT: ld.b32 %r82, [%rd5];
+; CHECK-NEXT: $L__BB6_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r82;
+; CHECK-NEXT: max.f16 %rs10, %rs9, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r29, %rs10;
+; CHECK-NEXT: and.b32 %r30, %r82, -65536;
+; CHECK-NEXT: or.b32 %r31, %r30, %r29;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r1, [%rd5], %r82, %r31;
+; CHECK-NEXT: setp.ne.b32 %p1, %r1, %r82;
+; CHECK-NEXT: mov.b32 %r82, %r1;
+; CHECK-NEXT: @%p1 bra $L__BB6_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r1;
+; CHECK-NEXT: mov.b16 %rs12, 0x0000;
+; CHECK-NEXT: mov.b32 %r2, {%rs11, %rs12};
+; CHECK-NEXT: mov.b32 %r13, 0;
+; CHECK-NEXT: add.s64 %rd6, %rd5, 2;
+; CHECK-NEXT: mov.b32 {_, %rs2}, %r25;
+; CHECK-NEXT: and.b64 %rd1, %rd6, -4;
+; CHECK-NEXT: cvt.u32.u64 %r32, %rd6;
+; CHECK-NEXT: and.b32 %r33, %r32, 3;
+; CHECK-NEXT: shl.b32 %r3, %r33, 3;
+; CHECK-NEXT: mov.b32 %r34, 65535;
+; CHECK-NEXT: shl.b32 %r35, %r34, %r3;
+; CHECK-NEXT: not.b32 %r4, %r35;
+; CHECK-NEXT: ld.b32 %r83, [%rd1];
+; CHECK-NEXT: $L__BB6_3: // %atomicrmw.start5
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r36, %r83, %r3;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r36;
+; CHECK-NEXT: max.f16 %rs14, %rs13, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r37, %rs14;
+; CHECK-NEXT: shl.b32 %r38, %r37, %r3;
+; CHECK-NEXT: and.b32 %r39, %r83, %r4;
+; CHECK-NEXT: or.b32 %r40, %r39, %r38;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r83, %r40;
+; CHECK-NEXT: setp.ne.b32 %p2, %r5, %r83;
+; CHECK-NEXT: mov.b32 %r83, %r5;
+; CHECK-NEXT: @%p2 bra $L__BB6_3;
+; CHECK-NEXT: // %bb.4: // %atomicrmw.end4
+; CHECK-NEXT: shr.u32 %r41, %r5, %r3;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r41;
+; CHECK-NEXT: mov.b32 {%rs16, _}, %r2;
+; CHECK-NEXT: mov.b32 %r18, {%rs16, %rs15};
+; CHECK-NEXT: mov.b32 {%rs3, _}, %r26;
+; CHECK-NEXT: ld.b32 %r84, [%rd5+4];
+; CHECK-NEXT: $L__BB6_5: // %atomicrmw.start20
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r84;
+; CHECK-NEXT: max.f16 %rs18, %rs17, %rs3;
+; CHECK-NEXT: cvt.u32.u16 %r42, %rs18;
+; CHECK-NEXT: and.b32 %r43, %r84, -65536;
+; CHECK-NEXT: or.b32 %r44, %r43, %r42;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r6, [%rd5+4], %r84, %r44;
+; CHECK-NEXT: setp.ne.b32 %p3, %r6, %r84;
+; CHECK-NEXT: mov.b32 %r84, %r6;
+; CHECK-NEXT: @%p3 bra $L__BB6_5;
+; CHECK-NEXT: // %bb.6: // %atomicrmw.end19
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r6;
+; CHECK-NEXT: mov.b32 {_, %rs20}, %r13;
+; CHECK-NEXT: mov.b32 %r7, {%rs19, %rs20};
+; CHECK-NEXT: add.s64 %rd7, %rd5, 6;
+; CHECK-NEXT: mov.b32 {_, %rs4}, %r26;
+; CHECK-NEXT: and.b64 %rd2, %rd7, -4;
+; CHECK-NEXT: cvt.u32.u64 %r45, %rd7;
+; CHECK-NEXT: and.b32 %r46, %r45, 3;
+; CHECK-NEXT: shl.b32 %r8, %r46, 3;
+; CHECK-NEXT: mov.b32 %r47, 65535;
+; CHECK-NEXT: shl.b32 %r48, %r47, %r8;
+; CHECK-NEXT: not.b32 %r9, %r48;
+; CHECK-NEXT: ld.b32 %r85, [%rd2];
+; CHECK-NEXT: $L__BB6_7: // %atomicrmw.start38
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r49, %r85, %r8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r49;
+; CHECK-NEXT: max.f16 %rs22, %rs21, %rs4;
+; CHECK-NEXT: cvt.u32.u16 %r50, %rs22;
+; CHECK-NEXT: shl.b32 %r51, %r50, %r8;
+; CHECK-NEXT: and.b32 %r52, %r85, %r9;
+; CHECK-NEXT: or.b32 %r53, %r52, %r51;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r10, [%rd2], %r85, %r53;
+; CHECK-NEXT: setp.ne.b32 %p4, %r10, %r85;
+; CHECK-NEXT: mov.b32 %r85, %r10;
+; CHECK-NEXT: @%p4 bra $L__BB6_7;
+; CHECK-NEXT: // %bb.8: // %atomicrmw.end37
+; CHECK-NEXT: shr.u32 %r54, %r10, %r8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r54;
+; CHECK-NEXT: mov.b32 {%rs24, _}, %r7;
+; CHECK-NEXT: mov.b32 %r19, {%rs24, %rs23};
+; CHECK-NEXT: mov.b32 {%rs5, _}, %r27;
+; CHECK-NEXT: ld.b32 %r86, [%rd5+8];
+; CHECK-NEXT: $L__BB6_9: // %atomicrmw.start54
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r86;
+; CHECK-NEXT: max.f16 %rs26, %rs25, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r55, %rs26;
+; CHECK-NEXT: and.b32 %r56, %r86, -65536;
+; CHECK-NEXT: or.b32 %r57, %r56, %r55;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r11, [%rd5+8], %r86, %r57;
+; CHECK-NEXT: setp.ne.b32 %p5, %r11, %r86;
+; CHECK-NEXT: mov.b32 %r86, %r11;
+; CHECK-NEXT: @%p5 bra $L__BB6_9;
+; CHECK-NEXT: // %bb.10: // %atomicrmw.end53
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r11;
+; CHECK-NEXT: mov.b32 {_, %rs28}, %r13;
+; CHECK-NEXT: mov.b32 %r12, {%rs27, %rs28};
+; CHECK-NEXT: add.s64 %rd8, %rd5, 10;
+; CHECK-NEXT: mov.b32 {_, %rs6}, %r27;
+; CHECK-NEXT: and.b64 %rd3, %rd8, -4;
+; CHECK-NEXT: cvt.u32.u64 %r58, %rd8;
+; CHECK-NEXT: and.b32 %r59, %r58, 3;
+; CHECK-NEXT: shl.b32 %r14, %r59, 3;
+; CHECK-NEXT: mov.b32 %r60, 65535;
+; CHECK-NEXT: shl.b32 %r61, %r60, %r14;
+; CHECK-NEXT: not.b32 %r15, %r61;
+; CHECK-NEXT: ld.b32 %r87, [%rd3];
+; CHECK-NEXT: $L__BB6_11: // %atomicrmw.start72
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r62, %r87, %r14;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r62;
+; CHECK-NEXT: max.f16 %rs30, %rs29, %rs6;
+; CHECK-NEXT: cvt.u32.u16 %r63, %rs30;
+; CHECK-NEXT: shl.b32 %r64, %r63, %r14;
+; CHECK-NEXT: and.b32 %r65, %r87, %r15;
+; CHECK-NEXT: or.b32 %r66, %r65, %r64;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r16, [%rd3], %r87, %r66;
+; CHECK-NEXT: setp.ne.b32 %p6, %r16, %r87;
+; CHECK-NEXT: mov.b32 %r87, %r16;
+; CHECK-NEXT: @%p6 bra $L__BB6_11;
+; CHECK-NEXT: // %bb.12: // %atomicrmw.end71
+; CHECK-NEXT: shr.u32 %r67, %r16, %r14;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r67;
+; CHECK-NEXT: mov.b32 {%rs32, _}, %r12;
+; CHECK-NEXT: mov.b32 %r20, {%rs32, %rs31};
+; CHECK-NEXT: mov.b32 {%rs7, _}, %r28;
+; CHECK-NEXT: ld.b32 %r88, [%rd5+12];
+; CHECK-NEXT: $L__BB6_13: // %atomicrmw.start88
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cvt.u16.u32 %rs33, %r88;
+; CHECK-NEXT: max.f16 %rs34, %rs33, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r68, %rs34;
+; CHECK-NEXT: and.b32 %r69, %r88, -65536;
+; CHECK-NEXT: or.b32 %r70, %r69, %r68;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r17, [%rd5+12], %r88, %r70;
+; CHECK-NEXT: setp.ne.b32 %p7, %r17, %r88;
+; CHECK-NEXT: mov.b32 %r88, %r17;
+; CHECK-NEXT: @%p7 bra $L__BB6_13;
+; CHECK-NEXT: // %bb.14: // %atomicrmw.end87
+; CHECK-NEXT: cvt.u16.u32 %rs35, %r17;
+; CHECK-NEXT: mov.b32 {_, %rs36}, %r13;
+; CHECK-NEXT: mov.b32 %r21, {%rs35, %rs36};
+; CHECK-NEXT: add.s64 %rd9, %rd5, 14;
+; CHECK-NEXT: mov.b32 {_, %rs8}, %r28;
+; CHECK-NEXT: and.b64 %rd4, %rd9, -4;
+; CHECK-NEXT: cvt.u32.u64 %r71, %rd9;
+; CHECK-NEXT: and.b32 %r72, %r71, 3;
+; CHECK-NEXT: shl.b32 %r22, %r72, 3;
+; CHECK-NEXT: mov.b32 %r73, 65535;
+; CHECK-NEXT: shl.b32 %r74, %r73, %r22;
+; CHECK-NEXT: not.b32 %r23, %r74;
+; CHECK-NEXT: ld.b32 %r89, [%rd4];
+; CHECK-NEXT: $L__BB6_15: // %atomicrmw.start106
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r75, %r89, %r22;
+; CHECK-NEXT: cvt.u16.u32 %rs37, %r75;
+; CHECK-NEXT: max.f16 %rs38, %rs37, %rs8;
+; CHECK-NEXT: cvt.u32.u16 %r76, %rs38;
+; CHECK-NEXT: shl.b32 %r77, %r76, %r22;
+; CHECK-NEXT: and.b32 %r78, %r89, %r23;
+; CHECK-NEXT: or.b32 %r79, %r78, %r77;
+; CHECK-NEXT: atom.relaxed.sys.cas.b32 %r24, [%rd4], %r89, %r79;
+; CHECK-NEXT: setp.ne.b32 %p8, %r24, %r89;
+; CHECK-NEXT: mov.b32 %r89, %r24;
+; CHECK-NEXT: @%p8 bra $L__BB6_15;
+; CHECK-NEXT: // %bb.16: // %atomicrmw.end105
+; CHECK-NEXT: shr.u32 %r80, %r24, %r22;
+; CHECK-NEXT: cvt.u16.u32 %rs39, %r80;
+; CHECK-NEXT: mov.b32 {%rs40, _}, %r21;
+; CHECK-NEXT: mov.b32 %r81, {%rs40, %rs39};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r18, %r19, %r20, %r81};
+; CHECK-NEXT: ret;
+entry:
+ %old = atomicrmw elementwise fmax ptr %addr, <8 x half> %val monotonic, align 16
+ ret <8 x half> %old
+}
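The PTX above is the machine-level form of the masked-cmpxchg idiom AtomicExpand emits for 16-bit lanes: each half lane that is not 32-bit aligned is widened to its containing word, updated under a shift/mask, and retried with atom.cas until the compare-exchange succeeds. The AMDGPU test below shows the same idiom at the IR level.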
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-elementwise.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-elementwise.ll
new file mode 100644
index 0000000000000..e339213461ef0
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-elementwise.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='require<libcall-lowering-info>,atomic-expand' -mcpu=gfx942 %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='require<libcall-lowering-info>,atomic-expand' -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX900 %s
+
+; gfx942 has ds_pk_add_f16/bf16, so atomicrmw fadd <2 x half> is natively
+; supported as a whole-value operation. The elementwise modifier should be
+; dropped and the instruction preserved.
+;
+; gfx900 lacks ds_pk_add, so the elementwise atomicrmw must be scalarized
+; into per-lane scalar atomicrmw instructions.
+
+define <2 x half> @elementwise_fadd_v2f16_lds(ptr addrspace(3) %ptr, <2 x half> %val) {
+; GFX942-LABEL: define <2 x half> @elementwise_fadd_v2f16_lds(
+; GFX942-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR]], <2 x half> [[VAL]] monotonic, align 4
+; GFX942-NEXT: ret <2 x half> [[RES]]
+;
+; GFX900-LABEL: define <2 x half> @elementwise_fadd_v2f16_lds(
+; GFX900-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX900-NEXT: [[LANE_VAL:%.*]] = extractelement <2 x half> [[VAL]], i64 0
+; GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR]], align 4
+; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]]
+; GFX900: [[ATOMICRMW_START]]:
+; GFX900-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ]
+; GFX900-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; GFX900-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; GFX900-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[LANE_VAL]]
+; GFX900-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16
+; GFX900-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; GFX900-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; GFX900-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX900-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]]
+; GFX900: [[ATOMICRMW_END]]:
+; GFX900-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; GFX900-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
+; GFX900-NEXT: [[LANE_OLD:%.*]] = insertelement <2 x half> zeroinitializer, half [[TMP5]], i64 0
+; GFX900-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(3) [[PTR]], i64 0, i64 1
+; GFX900-NEXT: [[LANE_VAL2:%.*]] = extractelement <2 x half> [[VAL]], i64 1
+; GFX900-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[LANE_PTR]], i32 -4)
+; GFX900-NEXT: [[TMP6:%.*]] = ptrtoint ptr addrspace(3) [[LANE_PTR]] to i32
+; GFX900-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP6]], 3
+; GFX900-NEXT: [[TMP7:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX900-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP7]]
+; GFX900-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GFX900-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; GFX900-NEXT: br label %[[ATOMICRMW_START5:.*]]
+; GFX900: [[ATOMICRMW_START5]]:
+; GFX900-NEXT: [[LOADED6:%.*]] = phi i32 [ [[TMP8]], %[[ATOMICRMW_END]] ], [ [[NEWLOADED14:%.*]], %[[ATOMICRMW_START5]] ]
+; GFX900-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED6]], [[TMP7]]
+; GFX900-NEXT: [[EXTRACTED7:%.*]] = trunc i32 [[SHIFTED]] to i16
+; GFX900-NEXT: [[TMP9:%.*]] = bitcast i16 [[EXTRACTED7]] to half
+; GFX900-NEXT: [[NEW8:%.*]] = fadd half [[TMP9]], [[LANE_VAL2]]
+; GFX900-NEXT: [[TMP10:%.*]] = bitcast half [[NEW8]] to i16
+; GFX900-NEXT: [[EXTENDED9:%.*]] = zext i16 [[TMP10]] to i32
+; GFX900-NEXT: [[SHIFTED10:%.*]] = shl nuw i32 [[EXTENDED9]], [[TMP7]]
+; GFX900-NEXT: [[UNMASKED11:%.*]] = and i32 [[LOADED6]], [[INV_MASK]]
+; GFX900-NEXT: [[INSERTED12:%.*]] = or i32 [[UNMASKED11]], [[SHIFTED10]]
+; GFX900-NEXT: [[TMP11:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED6]], i32 [[INSERTED12]] monotonic monotonic, align 4
+; GFX900-NEXT: [[SUCCESS13:%.*]] = extractvalue { i32, i1 } [[TMP11]], 1
+; GFX900-NEXT: [[NEWLOADED14]] = extractvalue { i32, i1 } [[TMP11]], 0
+; GFX900-NEXT: br i1 [[SUCCESS13]], label %[[ATOMICRMW_END4:.*]], label %[[ATOMICRMW_START5]]
+; GFX900: [[ATOMICRMW_END4]]:
+; GFX900-NEXT: [[SHIFTED15:%.*]] = lshr i32 [[NEWLOADED14]], [[TMP7]]
+; GFX900-NEXT: [[EXTRACTED16:%.*]] = trunc i32 [[SHIFTED15]] to i16
+; GFX900-NEXT: [[TMP12:%.*]] = bitcast i16 [[EXTRACTED16]] to half
+; GFX900-NEXT: [[LANE_OLD3:%.*]] = insertelement <2 x half> [[LANE_OLD]], half [[TMP12]], i64 1
+; GFX900-NEXT: ret <2 x half> [[LANE_OLD3]]
+;
+ %res = atomicrmw elementwise fadd ptr addrspace(3) %ptr, <2 x half> %val monotonic, align 4
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @elementwise_fadd_v2bf16_lds(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
+; GFX942-LABEL: define <2 x bfloat> @elementwise_fadd_v2bf16_lds(
+; GFX942-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR]], <2 x bfloat> [[VAL]] monotonic, align 4
+; GFX942-NEXT: ret <2 x bfloat> [[RES]]
+;
+; GFX900-LABEL: define <2 x bfloat> @elementwise_fadd_v2bf16_lds(
+; GFX900-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
+; GFX900-NEXT: [[LANE_VAL:%.*]] = extractelement <2 x bfloat> [[VAL]], i64 0
+; GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR]], align 4
+; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]]
+; GFX900: [[ATOMICRMW_START]]:
+; GFX900-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ]
+; GFX900-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; GFX900-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; GFX900-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[LANE_VAL]]
+; GFX900-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; GFX900-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; GFX900-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; GFX900-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX900-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]]
+; GFX900: [[ATOMICRMW_END]]:
+; GFX900-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; GFX900-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; GFX900-NEXT: [[LANE_OLD:%.*]] = insertelement <2 x bfloat> zeroinitializer, bfloat [[TMP5]], i64 0
+; GFX900-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <2 x bfloat>, ptr addrspace(3) [[PTR]], i64 0, i64 1
+; GFX900-NEXT: [[LANE_VAL2:%.*]] = extractelement <2 x bfloat> [[VAL]], i64 1
+; GFX900-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[LANE_PTR]], i32 -4)
+; GFX900-NEXT: [[TMP6:%.*]] = ptrtoint ptr addrspace(3) [[LANE_PTR]] to i32
+; GFX900-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP6]], 3
+; GFX900-NEXT: [[TMP7:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX900-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP7]]
+; GFX900-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GFX900-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; GFX900-NEXT: br label %[[ATOMICRMW_START5:.*]]
+; GFX900: [[ATOMICRMW_START5]]:
+; GFX900-NEXT: [[LOADED6:%.*]] = phi i32 [ [[TMP8]], %[[ATOMICRMW_END]] ], [ [[NEWLOADED14:%.*]], %[[ATOMICRMW_START5]] ]
+; GFX900-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED6]], [[TMP7]]
+; GFX900-NEXT: [[EXTRACTED7:%.*]] = trunc i32 [[SHIFTED]] to i16
+; GFX900-NEXT: [[TMP9:%.*]] = bitcast i16 [[EXTRACTED7]] to bfloat
+; GFX900-NEXT: [[NEW8:%.*]] = fadd bfloat [[TMP9]], [[LANE_VAL2]]
+; GFX900-NEXT: [[TMP10:%.*]] = bitcast bfloat [[NEW8]] to i16
+; GFX900-NEXT: [[EXTENDED9:%.*]] = zext i16 [[TMP10]] to i32
+; GFX900-NEXT: [[SHIFTED10:%.*]] = shl nuw i32 [[EXTENDED9]], [[TMP7]]
+; GFX900-NEXT: [[UNMASKED11:%.*]] = and i32 [[LOADED6]], [[INV_MASK]]
+; GFX900-NEXT: [[INSERTED12:%.*]] = or i32 [[UNMASKED11]], [[SHIFTED10]]
+; GFX900-NEXT: [[TMP11:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED6]], i32 [[INSERTED12]] monotonic monotonic, align 4
+; GFX900-NEXT: [[SUCCESS13:%.*]] = extractvalue { i32, i1 } [[TMP11]], 1
+; GFX900-NEXT: [[NEWLOADED14]] = extractvalue { i32, i1 } [[TMP11]], 0
+; GFX900-NEXT: br i1 [[SUCCESS13]], label %[[ATOMICRMW_END4:.*]], label %[[ATOMICRMW_START5]]
+; GFX900: [[ATOMICRMW_END4]]:
+; GFX900-NEXT: [[SHIFTED15:%.*]] = lshr i32 [[NEWLOADED14]], [[TMP7]]
+; GFX900-NEXT: [[EXTRACTED16:%.*]] = trunc i32 [[SHIFTED15]] to i16
+; GFX900-NEXT: [[TMP12:%.*]] = bitcast i16 [[EXTRACTED16]] to bfloat
+; GFX900-NEXT: [[LANE_OLD3:%.*]] = insertelement <2 x bfloat> [[LANE_OLD]], bfloat [[TMP12]], i64 1
+; GFX900-NEXT: ret <2 x bfloat> [[LANE_OLD3]]
+;
+ %res = atomicrmw elementwise fadd ptr addrspace(3) %ptr, <2 x bfloat> %val monotonic, align 4
+ ret <2 x bfloat> %res
+}
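Distilled from the first test above, the gfx942 path reduces to dropping the modifier (a minimal before/after sketch; the gfx900 path instead produces the masked cmpxchg loops shown in the checks):

; before expansion
%res = atomicrmw elementwise fadd ptr addrspace(3) %ptr, <2 x half> %val monotonic, align 4
; after expansion on gfx942 (packed fadd is native, so the vector op is kept whole)
%res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val monotonic, align 4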
diff --git a/llvm/test/Transforms/AtomicExpand/NVPTX/expand-atomic-rmw-elementwise.ll b/llvm/test/Transforms/AtomicExpand/NVPTX/expand-atomic-rmw-elementwise.ll
new file mode 100644
index 0000000000000..d6137ade559aa
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/NVPTX/expand-atomic-rmw-elementwise.ll
@@ -0,0 +1,262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S %s -passes='require<libcall-lowering-info>,atomic-expand' -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefix=SM90PTX83 %s
+; RUN: opt -S %s -passes='require<libcall-lowering-info>,atomic-expand' -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx81 | FileCheck --check-prefix=SM90PTX81 %s
+; RUN: opt -S %s -passes='require<libcall-lowering-info>,atomic-expand' -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 | FileCheck --check-prefix=SM90 %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; v4f32 is 128 bits. With the elementwise modifier it is scalarized into
+; per-lane fadd atomics under every RUN configuration here; the check
+; prefixes differ only in value naming.
+define <4 x float> @fadd_v4f32_monotonic(ptr %addr, <4 x float> %val) {
+; SM90PTX83-LABEL: define <4 x float> @fadd_v4f32_monotonic(
+; SM90PTX83-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; SM90PTX83-NEXT: [[ENTRY:.*:]]
+; SM90PTX83-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90PTX83-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 16
+; SM90PTX83-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX83-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX83-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90PTX83-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90PTX83-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX83-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX83-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90PTX83-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] monotonic, align 8
+; SM90PTX83-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90PTX83-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX83-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90PTX83-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] monotonic, align 4
+; SM90PTX83-NEXT: [[OLD:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90PTX83-NEXT: ret <4 x float> [[OLD]]
+;
+; SM90PTX81-LABEL: define <4 x float> @fadd_v4f32_monotonic(
+; SM90PTX81-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; SM90PTX81-NEXT: [[ENTRY:.*:]]
+; SM90PTX81-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90PTX81-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 16
+; SM90PTX81-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX81-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX81-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90PTX81-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90PTX81-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX81-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX81-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90PTX81-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] monotonic, align 8
+; SM90PTX81-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90PTX81-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX81-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90PTX81-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] monotonic, align 4
+; SM90PTX81-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90PTX81-NEXT: ret <4 x float> [[LANE_OLD8]]
+;
+; SM90-LABEL: define <4 x float> @fadd_v4f32_monotonic(
+; SM90-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; SM90-NEXT: [[ENTRY:.*:]]
+; SM90-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 16
+; SM90-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] monotonic, align 8
+; SM90-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] monotonic, align 4
+; SM90-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90-NEXT: ret <4 x float> [[LANE_OLD8]]
+;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <4 x float> %val monotonic, align 16
+ ret <4 x float> %old
+}
+
+; v2f32 is 64 bits and is likewise scalarized into per-lane fadd atomics
+; under every RUN configuration.
+define <2 x float> @fadd_v2f32_monotonic(ptr %addr, <2 x float> %val) {
+; SM90PTX83-LABEL: define <2 x float> @fadd_v2f32_monotonic(
+; SM90PTX83-SAME: ptr [[ADDR:%.*]], <2 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX83-NEXT: [[ENTRY:.*:]]
+; SM90PTX83-NEXT: [[LANE_VAL:%.*]] = extractelement <2 x float> [[VAL]], i64 0
+; SM90PTX83-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 8
+; SM90PTX83-NEXT: [[LANE_OLD:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX83-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <2 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX83-NEXT: [[LANE_VAL1:%.*]] = extractelement <2 x float> [[VAL]], i64 1
+; SM90PTX83-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90PTX83-NEXT: [[OLD:%.*]] = insertelement <2 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX83-NEXT: ret <2 x float> [[OLD]]
+;
+; SM90PTX81-LABEL: define <2 x float> @fadd_v2f32_monotonic(
+; SM90PTX81-SAME: ptr [[ADDR:%.*]], <2 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX81-NEXT: [[ENTRY:.*:]]
+; SM90PTX81-NEXT: [[LANE_VAL:%.*]] = extractelement <2 x float> [[VAL]], i64 0
+; SM90PTX81-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 8
+; SM90PTX81-NEXT: [[LANE_OLD:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX81-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <2 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX81-NEXT: [[LANE_VAL1:%.*]] = extractelement <2 x float> [[VAL]], i64 1
+; SM90PTX81-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90PTX81-NEXT: [[OLD:%.*]] = insertelement <2 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX81-NEXT: ret <2 x float> [[OLD]]
+;
+; SM90-LABEL: define <2 x float> @fadd_v2f32_monotonic(
+; SM90-SAME: ptr [[ADDR:%.*]], <2 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90-NEXT: [[ENTRY:.*:]]
+; SM90-NEXT: [[LANE_VAL:%.*]] = extractelement <2 x float> [[VAL]], i64 0
+; SM90-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] monotonic, align 8
+; SM90-NEXT: [[LANE_OLD:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <2 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90-NEXT: [[LANE_VAL1:%.*]] = extractelement <2 x float> [[VAL]], i64 1
+; SM90-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] monotonic, align 4
+; SM90-NEXT: [[LANE_OLD2:%.*]] = insertelement <2 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90-NEXT: ret <2 x float> [[LANE_OLD2]]
+;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <2 x float> %val monotonic, align 8
+ ret <2 x float> %old
+}
+
+; seq_cst: after scalarization, the target's fence splitting applies to each
+; per-lane atomic (a leading fence seq_cst plus an acquire atomicrmw).
+define <4 x float> @fadd_v4f32_seq_cst(ptr %addr, <4 x float> %val) {
+; SM90PTX83-LABEL: define <4 x float> @fadd_v4f32_seq_cst(
+; SM90PTX83-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX83-NEXT: [[ENTRY:.*:]]
+; SM90PTX83-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90PTX83-NEXT: fence seq_cst
+; SM90PTX83-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] acquire, align 16
+; SM90PTX83-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX83-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX83-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90PTX83-NEXT: fence seq_cst
+; SM90PTX83-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] acquire, align 4
+; SM90PTX83-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX83-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX83-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90PTX83-NEXT: fence seq_cst
+; SM90PTX83-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] acquire, align 8
+; SM90PTX83-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90PTX83-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX83-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90PTX83-NEXT: fence seq_cst
+; SM90PTX83-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] acquire, align 4
+; SM90PTX83-NEXT: [[OLD:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90PTX83-NEXT: ret <4 x float> [[OLD]]
+;
+; SM90PTX81-LABEL: define <4 x float> @fadd_v4f32_seq_cst(
+; SM90PTX81-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX81-NEXT: [[ENTRY:.*:]]
+; SM90PTX81-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90PTX81-NEXT: fence seq_cst
+; SM90PTX81-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] acquire, align 16
+; SM90PTX81-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90PTX81-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX81-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90PTX81-NEXT: fence seq_cst
+; SM90PTX81-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] acquire, align 4
+; SM90PTX81-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90PTX81-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX81-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90PTX81-NEXT: fence seq_cst
+; SM90PTX81-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] acquire, align 8
+; SM90PTX81-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90PTX81-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX81-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90PTX81-NEXT: fence seq_cst
+; SM90PTX81-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] acquire, align 4
+; SM90PTX81-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90PTX81-NEXT: ret <4 x float> [[LANE_OLD8]]
+;
+; SM90-LABEL: define <4 x float> @fadd_v4f32_seq_cst(
+; SM90-SAME: ptr [[ADDR:%.*]], <4 x float> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90-NEXT: [[ENTRY:.*:]]
+; SM90-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x float> [[VAL]], i64 0
+; SM90-NEXT: fence seq_cst
+; SM90-NEXT: [[TMP0:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[LANE_VAL]] acquire, align 16
+; SM90-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP0]], i64 0
+; SM90-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 1
+; SM90-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x float> [[VAL]], i64 1
+; SM90-NEXT: fence seq_cst
+; SM90-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr [[LANE_PTR]], float [[LANE_VAL1]] acquire, align 4
+; SM90-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x float> [[LANE_OLD]], float [[TMP1]], i64 1
+; SM90-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 2
+; SM90-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x float> [[VAL]], i64 2
+; SM90-NEXT: fence seq_cst
+; SM90-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[LANE_PTR3]], float [[LANE_VAL4]] acquire, align 8
+; SM90-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x float> [[LANE_OLD2]], float [[TMP2]], i64 2
+; SM90-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x float>, ptr [[ADDR]], i64 0, i64 3
+; SM90-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x float> [[VAL]], i64 3
+; SM90-NEXT: fence seq_cst
+; SM90-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[LANE_PTR6]], float [[LANE_VAL7]] acquire, align 4
+; SM90-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x float> [[LANE_OLD5]], float [[TMP3]], i64 3
+; SM90-NEXT: ret <4 x float> [[LANE_OLD8]]
+;
+entry:
+ %old = atomicrmw elementwise fadd ptr %addr, <4 x float> %val seq_cst, align 16
+ ret <4 x float> %old
+}
+
+; Integer add has no native vector atomic form; it is always scalarized into
+; per-lane atomics.
+define <4 x i32> @add_v4i32(ptr %addr, <4 x i32> %val) {
+; SM90PTX83-LABEL: define <4 x i32> @add_v4i32(
+; SM90PTX83-SAME: ptr [[ADDR:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX83-NEXT: [[ENTRY:.*:]]
+; SM90PTX83-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x i32> [[VAL]], i64 0
+; SM90PTX83-NEXT: [[TMP0:%.*]] = atomicrmw add ptr [[ADDR]], i32 [[LANE_VAL]] monotonic, align 16
+; SM90PTX83-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP0]], i64 0
+; SM90PTX83-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX83-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x i32> [[VAL]], i64 1
+; SM90PTX83-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[LANE_PTR]], i32 [[LANE_VAL1]] monotonic, align 4
+; SM90PTX83-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x i32> [[LANE_OLD]], i32 [[TMP1]], i64 1
+; SM90PTX83-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX83-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x i32> [[VAL]], i64 2
+; SM90PTX83-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[LANE_PTR3]], i32 [[LANE_VAL4]] monotonic, align 8
+; SM90PTX83-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x i32> [[LANE_OLD2]], i32 [[TMP2]], i64 2
+; SM90PTX83-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX83-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x i32> [[VAL]], i64 3
+; SM90PTX83-NEXT: [[TMP3:%.*]] = atomicrmw add ptr [[LANE_PTR6]], i32 [[LANE_VAL7]] monotonic, align 4
+; SM90PTX83-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x i32> [[LANE_OLD5]], i32 [[TMP3]], i64 3
+; SM90PTX83-NEXT: ret <4 x i32> [[LANE_OLD8]]
+;
+; SM90PTX81-LABEL: define <4 x i32> @add_v4i32(
+; SM90PTX81-SAME: ptr [[ADDR:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90PTX81-NEXT: [[ENTRY:.*:]]
+; SM90PTX81-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x i32> [[VAL]], i64 0
+; SM90PTX81-NEXT: [[TMP0:%.*]] = atomicrmw add ptr [[ADDR]], i32 [[LANE_VAL]] monotonic, align 16
+; SM90PTX81-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP0]], i64 0
+; SM90PTX81-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 1
+; SM90PTX81-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x i32> [[VAL]], i64 1
+; SM90PTX81-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[LANE_PTR]], i32 [[LANE_VAL1]] monotonic, align 4
+; SM90PTX81-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x i32> [[LANE_OLD]], i32 [[TMP1]], i64 1
+; SM90PTX81-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 2
+; SM90PTX81-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x i32> [[VAL]], i64 2
+; SM90PTX81-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[LANE_PTR3]], i32 [[LANE_VAL4]] monotonic, align 8
+; SM90PTX81-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x i32> [[LANE_OLD2]], i32 [[TMP2]], i64 2
+; SM90PTX81-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 3
+; SM90PTX81-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x i32> [[VAL]], i64 3
+; SM90PTX81-NEXT: [[TMP3:%.*]] = atomicrmw add ptr [[LANE_PTR6]], i32 [[LANE_VAL7]] monotonic, align 4
+; SM90PTX81-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x i32> [[LANE_OLD5]], i32 [[TMP3]], i64 3
+; SM90PTX81-NEXT: ret <4 x i32> [[LANE_OLD8]]
+;
+; SM90-LABEL: define <4 x i32> @add_v4i32(
+; SM90-SAME: ptr [[ADDR:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] {
+; SM90-NEXT: [[ENTRY:.*:]]
+; SM90-NEXT: [[LANE_VAL:%.*]] = extractelement <4 x i32> [[VAL]], i64 0
+; SM90-NEXT: [[TMP0:%.*]] = atomicrmw add ptr [[ADDR]], i32 [[LANE_VAL]] monotonic, align 16
+; SM90-NEXT: [[LANE_OLD:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP0]], i64 0
+; SM90-NEXT: [[LANE_PTR:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 1
+; SM90-NEXT: [[LANE_VAL1:%.*]] = extractelement <4 x i32> [[VAL]], i64 1
+; SM90-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[LANE_PTR]], i32 [[LANE_VAL1]] monotonic, align 4
+; SM90-NEXT: [[LANE_OLD2:%.*]] = insertelement <4 x i32> [[LANE_OLD]], i32 [[TMP1]], i64 1
+; SM90-NEXT: [[LANE_PTR3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 2
+; SM90-NEXT: [[LANE_VAL4:%.*]] = extractelement <4 x i32> [[VAL]], i64 2
+; SM90-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[LANE_PTR3]], i32 [[LANE_VAL4]] monotonic, align 8
+; SM90-NEXT: [[LANE_OLD5:%.*]] = insertelement <4 x i32> [[LANE_OLD2]], i32 [[TMP2]], i64 2
+; SM90-NEXT: [[LANE_PTR6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ADDR]], i64 0, i64 3
+; SM90-NEXT: [[LANE_VAL7:%.*]] = extractelement <4 x i32> [[VAL]], i64 3
+; SM90-NEXT: [[TMP3:%.*]] = atomicrmw add ptr [[LANE_PTR6]], i32 [[LANE_VAL7]] monotonic, align 4
+; SM90-NEXT: [[LANE_OLD8:%.*]] = insertelement <4 x i32> [[LANE_OLD5]], i32 [[TMP3]], i64 3
+; SM90-NEXT: ret <4 x i32> [[LANE_OLD8]]
+;
+entry:
+ %old = atomicrmw elementwise add ptr %addr, <4 x i32> %val monotonic, align 16
+ ret <4 x i32> %old
+}
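Condensed from the seq_cst checks above (value names shortened for readability, two of the four lanes shown), the expansion composes with NVPTX's fence splitting so each per-lane atomic gets its own leading fence:

fence seq_cst
%o0 = atomicrmw fadd ptr %addr, float %v0 acquire, align 16
%p1 = getelementptr inbounds <4 x float>, ptr %addr, i64 0, i64 1
fence seq_cst
%o1 = atomicrmw fadd ptr %p1, float %v1 acquire, align 4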
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-elementwise.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-elementwise.ll
new file mode 100644
index 0000000000000..c9e2067b5dffe
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-elementwise.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S %s -passes='require<libcall-lowering-info>,atomic-expand' -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define <4 x i32> @elem_add(ptr %p, <4 x i32> %v) {
+; CHECK-LABEL: define <4 x i32> @elem_add(
+; CHECK-SAME: ptr [[P:%.*]], <4 x i32> [[V:%.*]]) {
+; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x i32> [[V]], i64 0
+; CHECK-NEXT: [[O0:%.*]] = atomicrmw add ptr [[P]], i32 [[V0]] monotonic, align 16
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[O0]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P]], i64 0, i64 1
+; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x i32> [[V]], i64 1
+; CHECK-NEXT: [[O1:%.*]] = atomicrmw add ptr [[P1]], i32 [[V1]] monotonic, align 4
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[O1]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P]], i64 0, i64 2
+; CHECK-NEXT: [[V2:%.*]] = extractelement <4 x i32> [[V]], i64 2
+; CHECK-NEXT: [[O2:%.*]] = atomicrmw add ptr [[P2]], i32 [[V2]] monotonic, align 8
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[O2]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P]], i64 0, i64 3
+; CHECK-NEXT: [[V3:%.*]] = extractelement <4 x i32> [[V]], i64 3
+; CHECK-NEXT: [[O3:%.*]] = atomicrmw add ptr [[P3]], i32 [[V3]] monotonic, align 4
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[O3]], i64 3
+; CHECK-NEXT: ret <4 x i32> [[R3]]
+;
+ %old = atomicrmw elementwise add ptr %p, <4 x i32> %v monotonic
+ ret <4 x i32> %old
+}
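One detail visible in every target's output: the per-lane alignment (16, 4, 8, 4 for a 16-byte-aligned <4 x i32>) is the strongest alignment provable from the base alignment plus the lane's byte offset. A hypothetical sketch of that computation (names are illustrative, not necessarily the pass's actual code):

  // assumed: BaseAlign = 16 and 4-byte elements; commonAlignment is from
  // llvm/Support/Alignment.h and yields the largest power-of-two alignment
  // guaranteed at the given offset
  Align LaneAlign = commonAlignment(BaseAlign, Idx * EltBytes);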
>From 0b0cc564191eb8c135bc3025da424b129088742f Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 31 Mar 2026 05:28:32 +0000
Subject: [PATCH 2/3] change order
---
llvm/include/llvm/IR/IRBuilder.h | 4 ++--
llvm/include/llvm/IR/Instructions.h | 5 +++--
llvm/lib/AsmParser/LLParser.cpp | 3 ++-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +-
llvm/lib/IR/Instructions.cpp | 6 +++---
5 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 9ebac9b8c79e9..d61fdf71ffc0a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1948,8 +1948,8 @@ class IRBuilderBase {
Align = llvm::Align(DL.getTypeStoreSize(Val->getType()));
}
- return Insert(
- new AtomicRMWInst(Op, Ptr, Val, *Align, Ordering, SSID, Elementwise));
+ return Insert(new AtomicRMWInst(Op, Ptr, Val, *Align, Ordering, SSID,
+ nullptr, Elementwise));
}
CallInst *CreateStructuredGEP(Type *BaseType, Value *PtrBase,
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 9267be115b0cd..5948e0ac551bf 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -809,8 +809,9 @@ class AtomicRMWInst : public Instruction {
public:
LLVM_ABI AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
- SyncScope::ID SSID, bool Elementwise = false,
- InsertPosition InsertBefore = nullptr);
+ SyncScope::ID SSID,
+ InsertPosition InsertBefore = nullptr,
+ bool Elementwise = false);
// allocate space for exactly two operands
void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 997eea293291d..0f96f0c9ecf50 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -9102,7 +9102,8 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
Val->getType()));
AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val,
Alignment.value_or(DefaultAlignment),
- Ordering, SSID, IsElementwise);
+ Ordering, SSID, nullptr,
+ IsElementwise);
RMWI->setVolatile(IsVolatile);
Inst = RMWI;
return AteExtraComma ? InstExtraComma : InstNormal;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 60c8c7812ce81..c8ae658c5551a 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -6739,7 +6739,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Align(TheModule->getDataLayout().getTypeStoreSize(Val->getType()));
I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID,
- IsElementwise);
+ nullptr, IsElementwise);
ResTypeID = ValTypeID;
cast<AtomicRMWInst>(I)->setVolatile(IsVol);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index f940893ab0296..514377d18e8f5 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1458,8 +1458,8 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
- SyncScope::ID SSID, bool Elementwise,
- InsertPosition InsertBefore)
+ SyncScope::ID SSID, InsertPosition InsertBefore,
+ bool Elementwise)
: Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) {
Init(Operation, Ptr, Val, Alignment, Ordering, SSID, Elementwise);
}
@@ -4401,7 +4401,7 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
AtomicRMWInst *AtomicRMWInst::cloneImpl() const {
AtomicRMWInst *Result = new AtomicRMWInst(
getOperation(), getOperand(0), getOperand(1), getAlign(), getOrdering(),
- getSyncScopeID(), isElementwise());
+ getSyncScopeID(), nullptr, isElementwise());
Result->setVolatile(isVolatile());
return Result;
}
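With InsertBefore now ahead of Elementwise, a call site that wants the default insert position but an elementwise instruction must spell the position out, as the updated call sites above do. A usage sketch (operands, alignment, and ordering are illustrative, not taken from the patch):

  auto *RMW = new AtomicRMWInst(AtomicRMWInst::FAdd, Ptr, Val, Align(8),
                                AtomicOrdering::Monotonic, SyncScope::System,
                                /*InsertBefore=*/nullptr, /*Elementwise=*/true);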
>From 0f26383fc8da263f5325ddad8f841d89bf5d88c4 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 31 Mar 2026 05:30:19 +0000
Subject: [PATCH 3/3] format
---
llvm/lib/AsmParser/LLParser.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 0f96f0c9ecf50..f2ff8c3c5bb0d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -9100,10 +9100,9 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
const Align DefaultAlignment(
PFS.getFunction().getDataLayout().getTypeStoreSize(
Val->getType()));
- AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val,
- Alignment.value_or(DefaultAlignment),
- Ordering, SSID, nullptr,
- IsElementwise);
+ AtomicRMWInst *RMWI = new AtomicRMWInst(
+ Operation, Ptr, Val, Alignment.value_or(DefaultAlignment), Ordering, SSID,
+ nullptr, IsElementwise);
RMWI->setVolatile(IsVolatile);
Inst = RMWI;
return AteExtraComma ? InstExtraComma : InstNormal;