[llvm] [AMDGPU][Attributor] Infer `inreg` attribute in `AMDGPUAttributor` (PR #101609)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 19:18:55 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/101609
From 1c64e7ccac7fb52a62918303a2c1097a339d5593 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 21 May 2025 22:18:35 -0400
Subject: [PATCH] [WIP][AMDGPU][Attributor] Infer `inreg` attribute in
`AMDGPUAttributor`
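
This patch introduces AAAMDGPUUniform, an abstract attribute that infers,
for each argument of a non-entry function, whether every call site passes
a uniform value: a constant, an argument that is itself known or assumed
uniform, or a value that UniformityInfo proves uniform in the caller.
Qualifying arguments receive `inreg` plus a temporary "uniform" string
attribute, and a post-processing step then routes the corresponding
call-site operands through llvm.amdgcn.readfirstlane and strips the
temporary attribute.

A minimal sketch of the intended transformation (hypothetical names @g,
@callee and @kernel, modeled on the new inreg-inference.ll test below):

  @g = addrspace(1) global i32 0, align 4

  define internal fastcc void @callee(i32 %y) {
    store i32 %y, ptr addrspace(1) @g, align 4
    ret void
  }

  define amdgpu_kernel void @kernel(i32 %x) {
    ; %x is a uniform kernel argument, so %y is uniform at every call site.
    tail call fastcc void @callee(i32 %x)
    ret void
  }

is rewritten to roughly:

  define internal fastcc void @callee(i32 inreg %y) {
    store i32 %y, ptr addrspace(1) @g, align 4
    ret void
  }

  define amdgpu_kernel void @kernel(i32 %x) {
    %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
    tail call fastcc void @callee(i32 %rfl)
    ret void
  }

The readfirstlane calls work around a downstream issue where the missing
waterfall loop for `inreg` arguments can produce an invalid VGPR-to-SGPR
copy; since the attribute is only added for provably uniform values,
readfirstlane is sufficient and no waterfall loop is needed.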
---
llvm/include/llvm/IR/Argument.h | 2 +
llvm/lib/IR/Function.cpp | 4 +
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 189 ++++++++++++-
llvm/test/CodeGen/AMDGPU/aa-as-infer.ll | 16 +-
...butor-accesslist-offsetbins-out-of-sync.ll | 5 +-
llvm/test/CodeGen/AMDGPU/inreg-inference.ll | 257 ++++++++++++++++++
.../AMDGPU/remove-no-kernel-id-attribute.ll | 15 +-
7 files changed, 473 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/inreg-inference.ll
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h
index 5be58d7eca060..a1729b847e9fb 100644
--- a/llvm/include/llvm/IR/Argument.h
+++ b/llvm/include/llvm/IR/Argument.h
@@ -173,6 +173,8 @@ class Argument final : public Value {
/// Remove attributes from an argument.
void removeAttr(Attribute::AttrKind Kind);
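+  /// Remove a string attribute from an argument.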
+ void removeAttr(StringRef Kind);
+
void removeAttrs(const AttributeMask &AM);
/// Check if an argument has a given attribute.
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 85c28b0205691..ff36e334ebaee 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -340,6 +340,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
getParent()->removeParamAttr(getArgNo(), Kind);
}
+void Argument::removeAttr(StringRef Kind) {
+ getParent()->removeParamAttr(getArgNo(), Kind);
+}
+
void Argument::removeAttrs(const AttributeMask &AM) {
AttributeList AL = getParent()->getAttributes();
AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ee887447972bf..10b41a5c8a328 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,10 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/InitializePasses.h"
@@ -1299,6 +1302,130 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAAMDGPUUniform"; }
+
+ const std::string getAsStr(Attributor *A) const override {
+ return getAssumed() ? "inreg" : "non-inreg";
+ }
+
+ void trackStatistics() const override {}
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDGPUUniform
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDGPUUniform::ID = 0;
+
+struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
+ AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
+ : AAAMDGPUUniform(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Argument *Arg = getAssociatedArgument();
+ CallingConv::ID CC = Arg->getParent()->getCallingConv();
+ if (Arg->hasAttribute(Attribute::InReg)) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+ if (AMDGPU::isEntryFunctionCC(CC)) {
+      // We only use isArgPassedInSGPR on kernel entry function arguments, so
+      // the potential i1 argument change will not affect this.
+ if (AMDGPU::isArgPassedInSGPR(Arg))
+ indicateOptimisticFixpoint();
+ else
+ indicatePessimisticFixpoint();
+ }
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
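+    // A call-site operand counts as uniform if it is a constant, an
+    // argument that is itself known or assumed uniform, or a value that
+    // UniformityInfo proves uniform in the caller.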
+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
+ CallBase *CB = ACS.getInstruction();
+ Value *V = CB->getArgOperandUse(ArgNo);
+ if (isa<Constant>(V))
+ return true;
+ Function *F = nullptr;
+ if (auto *Arg = dyn_cast<Argument>(V)) {
+ auto *AA =
+ A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
+ if (AA)
+ return AA->isValidState();
+ F = Arg->getParent();
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ F = I->getFunction();
+ }
+
+ if (F) {
+ auto *UA =
+ A.getInfoCache()
+ .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
+ return UA && UA->isUniform(V);
+ }
+
+      // Conservatively treat any other value kind as non-uniform.
+ return false;
+ };
+
+ bool UsedAssumedInformation = true;
+ if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
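+    // If the result did not rely on any assumed (still converging)
+    // information, it is final, so fix the state optimistically.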
+ if (!UsedAssumedInformation)
+ return indicateOptimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ Argument *Arg = getAssociatedArgument();
+ if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
+ return ChangeStatus::UNCHANGED;
+    // If the argument already has the inreg attribute, we do not need to do
+    // anything about it.
+ if (Arg->hasAttribute(Attribute::InReg))
+ return ChangeStatus::UNCHANGED;
+    // Add both the inreg and "uniform" attributes to the argument. We will
+    // emit a readfirstlane at each call site of an inreg uniform argument,
+    // and the "uniform" attribute will be removed afterwards.
+ LLVMContext &Ctx = Arg->getContext();
+ return A.manifestAttrs(getIRPosition(),
+ {Attribute::get(Ctx, Attribute::InReg),
+ Attribute::get(Ctx, "uniform")});
+ }
+};
+
+AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_ARGUMENT:
+ return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
+  // TODO: Since inreg is also allowed on return values, we may need to add
+  // AAAMDGPUUniformCallSiteReturned.
+ default:
+ llvm_unreachable("not a valid position for AAAMDGPUUniform");
+ }
+}
+
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1367,6 +1494,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
return Changed;
}
+/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
+/// each call site. The inreg uniform attribute combination is set by
+/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
+/// where failing to emit a waterfall loop for 'inreg' arguments may result in
+/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
+/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
+/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
+/// appropriate.
+static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
+ std::vector<std::pair<CallBase *, unsigned>> WorkList;
+
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ for (Argument &Arg : F.args()) {
+ if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
+ continue;
+ unsigned ArgNo = Arg.getArgNo();
+ for (Use &U : F.uses()) {
+ auto *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB)
+ continue;
+      // Skip the call site argument if it is itself an inreg argument of the
+      // caller; in that case it is already in an SGPR.
+ if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand(ArgNo))) {
+ if (CSArg->hasAttribute(Attribute::InReg))
+ continue;
+ }
+ WorkList.emplace_back(CB, ArgNo);
+ }
+      // Not counted as a change since "uniform" never escapes this pass.
+ Arg.removeAttr("uniform");
+ }
+ }
+
+ if (WorkList.empty())
+ return false;
+
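+  // Route each recorded call-site operand through readfirstlane, and erase
+  // the original instruction if the rewrite left it without uses.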
+ for (auto &[CB, ArgNo] : WorkList) {
+ Value *V = CB->getArgOperand(ArgNo);
+ IRBuilder<> Builder(CB);
+ Value *NewV = Builder.CreateIntrinsic(V->getType(),
+ Intrinsic::amdgcn_readfirstlane, {V});
+ CB->setArgOperand(ArgNo, NewV);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->use_empty())
+ I->eraseFromParent();
+ }
+ }
+
+ return true;
+}
+
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options,
ThinOrFullLTOPhase LTOPhase) {
@@ -1385,7 +1565,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAInstanceInfo::ID});
+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1438,11 +1618,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
IRPosition::value(*CmpX->getPointerOperand()));
}
}
+
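+    // Seed AAAMDGPUUniform for every argument of non-entry functions so
+    // that `inreg` can be inferred for them.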
+ if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ for (auto &Arg : F->args())
+ A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
+ }
}
bool Changed = A.run() == ChangeStatus::CHANGED;
Changed |= updateWavesPerEU(M, TM);
+ Changed |= emitReadFirstLaneForInregUniformArgs(M);
return Changed;
}
@@ -1470,6 +1656,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<CycleInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
}
StringRef getPassName() const override { return "AMDGPU Attributor"; }
diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
index d1a6414fe49ae..b38ddec738139 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
@@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4
define internal void @can_infer_cmpxchg(ptr %word) {
; CHECK-LABEL: define internal void @can_infer_cmpxchg(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
; CHECK-NEXT: [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {
define internal void @can_infer_atomicrmw(ptr %word) {
; CHECK-LABEL: define internal void @can_infer_atomicrmw(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
; CHECK-NEXT: [[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr
-; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
-; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
+; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]])
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]])
-; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
-; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
+; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]])
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]])
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
index d58a62408427d..6a67c4e9e153a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
@@ -8,7 +8,7 @@
define internal fastcc void @foo(ptr %kg) {
; CHECK-LABEL: define internal fastcc void @foo(
-; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
@@ -80,7 +80,8 @@ define amdgpu_kernel void @kernel() #0 {
; CHECK-NEXT: [[KGLOBALS_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[SD]] to ptr
; CHECK-NEXT: [[NUM_CLOSURE_I_I:%.*]] = getelementptr i8, ptr addrspace(5) [[SD]], i32 276
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[NUM_CLOSURE_I_I]], align 4
-; CHECK-NEXT: call fastcc void @foo(ptr [[KGLOBALS_ASCAST1]])
+; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[KGLOBALS_ASCAST1]])
+; CHECK-NEXT: call fastcc void @foo(ptr [[TMP0]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
new file mode 100644
index 0000000000000..050dc995fc762
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
@@ -0,0 +1,257 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s
+
+@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g4 = protected addrspace(1) externally_initialized global i32 0, align 4
+
+;.
+; CHECK: @g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g3 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g4 = protected addrspace(1) externally_initialized global i32 0, align 4
+;.
+define internal fastcc void @callee_infer(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@callee_infer
+; CHECK-SAME: (ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_infer(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_infer
+; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) @g1)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]])
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) @g2)
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]])
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP2]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) @g1)
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 1)
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP4]], i32 [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) @g2)
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 2)
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP6]], i32 [[TMP7]])
+; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]])
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP8]], i32 [[TMP9]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp sgt i32 %x, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 %x)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 %x)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2)
+ tail call fastcc void @callee_infer(ptr addrspace(1) %p, i32 %x)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_infer_indirect(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_infer_indirect
+; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: store ptr @kernel_infer, ptr addrspace(5) [[FN]], align 8
+; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g1, i32 [[X]])
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g2, i32 [[X]])
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g1, i32 1)
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g2, i32 2)
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %fn = alloca ptr, addrspace(5)
+ %cmp = icmp sgt i32 %x, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ store ptr @kernel_infer, ptr addrspace(5) %fn
+ %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr
+ tail call fastcc void %fn.cast(ptr addrspace(1) @g1, i32 %x)
+ tail call fastcc void %fn.cast(ptr addrspace(1) @g2, i32 %x)
+ tail call fastcc void %fn.cast(ptr addrspace(1) @g1, i32 1)
+ tail call fastcc void %fn.cast(ptr addrspace(1) @g2, i32 2)
+ tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %x)
+ ret void
+}
+
+define internal fastcc void @callee_not_infer(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@callee_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_not_infer(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[Q]], i32 [[ID_X]])
+; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ %cmp = icmp sgt i32 %d, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ tail call fastcc void @callee_not_infer(ptr addrspace(1) %q, i32 %id.x)
+ tail call fastcc void @callee_not_infer(ptr addrspace(1) %p, i32 %id.x)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_not_infer_indirect(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer_indirect
+; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: store ptr @kernel_not_infer, ptr addrspace(5) [[FN]], align 8
+; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[Q]], i32 [[ID_X]])
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[ID_X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %fn = alloca ptr, addrspace(5)
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ %cmp = icmp sgt i32 %d, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ store ptr @kernel_not_infer, ptr addrspace(5) %fn
+ %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr
+ tail call fastcc void %fn.cast(ptr addrspace(1) %q, i32 %id.x)
+ tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %id.x)
+ ret void
+}
+
+define internal fastcc void @cs_callee_not_infer(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@cs_callee_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define amdgpu_cs void @cs_kernel_not_infer(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define {{[^@]+}}@cs_kernel_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) [[Q]], i32 [[ID_X]])
+; CHECK-NEXT: tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ %cmp = icmp sgt i32 %d, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) %q, i32 %id.x)
+ tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) %p, i32 %id.x)
+ ret void
+}
+
+define internal fastcc void @cs_callee_not_infer_indirect(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@cs_callee_not_infer_indirect
+; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+
+define amdgpu_cs void @cs_kernel_not_infer_indirect(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define {{[^@]+}}@cs_kernel_not_infer_indirect
+; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: store ptr @cs_callee_not_infer_indirect, ptr addrspace(5) [[FN]], align 8
+; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[Q]], i32 [[ID_X]])
+; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[ID_X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %fn = alloca ptr, addrspace(5)
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ %cmp = icmp sgt i32 %d, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ store ptr @cs_callee_not_infer_indirect, ptr addrspace(5) %fn
+ %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr
+ tail call fastcc void %fn.cast(ptr addrspace(1) %q, i32 %id.x)
+ tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %id.x)
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index ed4e691fbf154..f49ca5f17f742 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,14 +148,15 @@ define amdgpu_kernel void @kernel_lds() {
define internal i16 @mutual_recursion_0(i16 %arg) {
; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) [[RECURSIVE_KERNEL_LDS1]], align 2
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 7
-; CHECK-NEXT: [[RET:%.*]] = call i16 @mutual_recursion_1(i16 [[LD]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[LD]])
+; CHECK-NEXT: [[RET:%.*]] = call i16 @mutual_recursion_1(i16 [[TMP3]])
; CHECK-NEXT: [[ADD:%.*]] = add i16 [[RET]], 1
; CHECK-NEXT: ret i16 [[ADD]]
;
@@ -168,7 +169,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
define internal void @mutual_recursion_1(i16 %arg) {
; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]])
; CHECK-NEXT: ret void
;
@@ -180,7 +181,8 @@ define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
; CHECK-SAME: ) #[[ATTR5:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]]
-; CHECK-NEXT: call void @mutual_recursion_0(i16 0)
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 0)
+; CHECK-NEXT: call void @mutual_recursion_0(i16 [[TMP1]])
; CHECK-NEXT: ret void
;
call void @mutual_recursion_0(i16 0)
@@ -197,8 +199,9 @@ define amdgpu_kernel void @kernel_lds_recursion() {
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[META0]] = !{i32 0, i32 1}
; CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}