[clang] [AMDGPU] Change the representation of double literals in operands (PR #68740)

Stanislav Mekhanoshin via cfe-commits cfe-commits at lists.llvm.org
Thu Oct 12 12:22:16 PDT 2023


https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/68740

From cc9e065a9218eb36750a2c2a4a4d08fae3f329fa Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 4 Oct 2023 13:36:25 -0700
Subject: [PATCH 01/10] [AMDGPU] Change the representation of double literals
 in operands

A 64-bit literal can be used as a 32-bit zero- or sign-extended
operand. In the case of a double, zeroes are added to the low 32 bits.
Currently the asm parser stores only the high 32 bits of a double into
an operand. To support codegen, as requested in
https://github.com/llvm/llvm-project/issues/67781, we need to change
the representation to store the full 64-bit value so that codegen can
simply add immediates to an instruction.

There is some code to support compatibility with existing tests and
asm kernels: short hex strings that encode only the high 32 bits of a
double value are still accepted as valid literals.
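
For illustration, a minimal standalone sketch in plain C++ (not the
LLVM helpers themselves) of the encoding rule above: a double literal
can be emitted as a 32-bit literal operand only when its low 32 bits
are zero, in which case the 32-bit slot holds the high 32 bits. This
mirrors isValid32BitLiteral(Val, /*IsFP=*/true) and the Hi_32 handling
added by this patch; the names in the program are illustrative only.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // An FP64 value fits the 32-bit literal slot iff its low 32 bits are zero.
  static bool isValid32BitFPLiteral(uint64_t Val) {
    return (Val & 0xffffffffu) == 0;
  }

  int main() {
    double D = 2.5;                       // bit pattern 0x4004000000000000
    uint64_t Bits;
    std::memcpy(&Bits, &D, sizeof(Bits)); // the operand now carries all 64 bits

    if (isValid32BitFPLiteral(Bits))      // emitter writes only Hi_32(Bits)
      std::printf("encoded literal: 0x%08x\n", uint32_t(Bits >> 32)); // 0x40040000
    else
      std::printf("low 32 bits would be dropped (the parser warns)\n");
    return 0;
  }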
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 21 ++++++++++++--
 .../Disassembler/AMDGPUDisassembler.cpp       | 28 ++++++++++++++-----
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  9 ++++--
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 12 +++++---
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +-
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp      |  3 ++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  4 ++-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  7 +++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++
 9 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 35656bcaea1af7f..0553d3f20b21c56 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2140,9 +2140,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
           const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
           "Can't encode literal as exact 64-bit floating-point operand. "
           "Low 32-bits will be set to zero");
+          Val &= 0xffffffff00000000u;
         }
 
-        Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+        Inst.addOperand(MCOperand::createImm(Val));
         setImmKindLiteral();
         return;
       }
@@ -2241,7 +2242,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
       return;
     }
 
-    Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+    if (isInt<32>(Val) || isUInt<32>(Val))
+      Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? Val << 32 : Lo_32(Val);
+
+    Inst.addOperand(MCOperand::createImm(Val));
     setImmKindLiteral();
     return;
 
@@ -4297,7 +4301,18 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
       continue;
 
     if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
-      uint32_t Value = static_cast<uint32_t>(MO.getImm());
+      uint64_t Value = static_cast<uint64_t>(MO.getImm());
+      bool IsFP = AMDGPU::isSISrcFPOperand(Desc, OpIdx);
+      bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP);
+
+      if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) {
+        Error(getLitLoc(Operands), "invalid operand for instruction");
+        return false;
+      }
+
+      if (IsFP && IsValid32Op)
+        Value = Hi_32(Value);
+
       if (NumLiterals == 0 || LiteralValue != Value) {
         LiteralValue = Value;
         ++NumLiterals;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 439762bc6caf786..8c49c9a9c87772e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -378,6 +378,15 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
 }
 
+static DecodeStatus
+decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr,
+                       const MCDisassembler *Decoder) {
+  assert(Imm < (1 << 9) && "9-bit encoding");
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm,
+                                            false, 64, true));
+}
+
 static DecodeStatus
 DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
@@ -1218,7 +1227,7 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
   return MCOperand::createImm(Literal);
 }
 
-MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
@@ -1228,9 +1237,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
                         Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal = eatBytes<uint32_t>(Bytes);
+    Literal = Literal64 = eatBytes<uint32_t>(Bytes);
+    if (ExtendFP64)
+      Literal64 <<= 32;
   }
-  return MCOperand::createImm(Literal);
+  return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
 }
 
 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1447,7 +1458,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
 
 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                           bool MandatoryLiteral,
-                                          unsigned ImmWidth) const {
+                                          unsigned ImmWidth,
+                                          bool IsFP) const {
   using namespace AMDGPU::EncValues;
 
   assert(Val < 1024); // enum10
@@ -1459,13 +1471,15 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
     return createRegOperand(IsAGPR ? getAgprClassId(Width)
                                    : getVgprClassId(Width), Val - VGPR_MIN);
   }
-  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth);
+  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
+                            IsFP);
 }
 
 MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
                                                  unsigned Val,
                                                  bool MandatoryLiteral,
-                                                 unsigned ImmWidth) const {
+                                                 unsigned ImmWidth,
+                                                 bool IsFP) const {
   // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
   // decoded earlier.
   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
@@ -1493,7 +1507,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
       // Keep a sentinel value for deferred setting
       return MCOperand::createImm(LITERAL_CONST);
     else
-      return decodeLiteralConstant();
+      return decodeLiteralConstant(IsFP && ImmWidth == 64);
   }
 
   switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5f3b277d577ff7c..865db2b26307b43 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -97,6 +97,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   const unsigned TargetMaxInstBytes;
   mutable ArrayRef<uint8_t> Bytes;
   mutable uint32_t Literal;
+  mutable uint64_t Literal64;
   mutable bool HasLiteral;
   mutable std::optional<bool> EnableWavefrontSize32;
 
@@ -229,15 +230,17 @@ class AMDGPUDisassembler : public MCDisassembler {
   static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm);
 
   MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
-  MCOperand decodeLiteralConstant() const;
+  MCOperand decodeLiteralConstant(bool ExtendFP64) const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
                         bool MandatoryLiteral = false,
-                        unsigned ImmWidth = 0) const;
+                        unsigned ImmWidth = 0,
+                        bool IsFP = false) const;
 
   MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
                                bool MandatoryLiteral = false,
-                               unsigned ImmWidth = 0) const;
+                               unsigned ImmWidth = 0,
+                               bool IsFP = false) const;
 
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad4c48a8d65581a..40e92f00a9e52a6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -426,7 +426,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
 
 void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
                                          const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
+                                         raw_ostream &O, bool IsFP) {
   int64_t SImm = static_cast<int64_t>(Imm);
   if (SImm >= -16 && SImm <= 64) {
     O << SImm;
@@ -454,6 +454,8 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
   else if (Imm == 0x3fc45f306dc9c882 &&
            STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
     O << "0.15915494309189532";
+  else if (IsFP && AMDGPU::isValid32BitLiteral(Imm, true))
+    O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
   else {
     assert(isUInt<32>(Imm) || isInt<32>(Imm));
 
@@ -605,11 +607,13 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       printImmediate32(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+      printImmediate64(Op.getImm(), STI, O, false);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-      printImmediate64(Op.getImm(), STI, O);
+      printImmediate64(Op.getImm(), STI, O, true);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
@@ -671,7 +675,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       if (RCBits == 32)
         printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
       else if (RCBits == 64)
-        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O);
+        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
       else
         llvm_unreachable("Invalid register class size");
     }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3b14faab136b35a..dc83547a4afe049 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -91,7 +91,7 @@ class AMDGPUInstPrinter : public MCInstPrinter {
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
   void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
+                        raw_ostream &O, bool IsFP);
   void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
   void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 57ccb523c70eee6..d93f747bf6f0a64 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -411,6 +411,9 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
     } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
       llvm_unreachable("Must be immediate or expr");
 
+    if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64)
+      Imm = Hi_32(Imm);
+
     support::endian::write<uint32_t>(CB, Imm, support::endianness::little);
 
     // Only one literal value allowed
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c3c5bfae405aa45..ea06e85fb400c1b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1263,7 +1263,9 @@ def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">;
-def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrc_f64";
+}
 def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
 def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d0ad763d9e6cc1..e7907b28abedf9d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2519,6 +2519,13 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   return Lo16 == Hi16;
 }
 
+bool isValid32BitLiteral(uint64_t Val, bool IsFP) {
+  if (IsFP)
+    return !(Val & 0xffffffffu);
+
+  return isUInt<32>(Val) || isInt<32>(Val);
+}
+
 bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 297a69f54d63721..fbe9adfd74fa9c6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1290,6 +1290,9 @@ bool isInlinableIntLiteralV216(int32_t Literal);
 LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isValid32BitLiteral(uint64_t Val, bool IsFP);
+
 bool isArgPassedInSGPR(const Argument *Arg);
 
 bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);

From b74ad67d36f6832fe582ab33080fdda4e5da5408 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 4 Oct 2023 13:36:25 -0700
Subject: [PATCH 02/10] [AMDGPU] Change the representation of double literals
 in operands

A 64-bit literal can be used as a 32-bit zero- or sign-extended
operand. In the case of a double, zeroes are added to the low 32 bits.
Currently the asm parser stores only the high 32 bits of a double into
an operand. To support codegen, as requested in
https://github.com/llvm/llvm-project/issues/67781, we need to change
the representation to store the full 64-bit value so that codegen can
simply add immediates to an instruction.

There is some code to support compatibility with existing tests and
asm kernels: short hex strings that encode only the high 32 bits of a
double value are still accepted as valid literals.
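
As a complementary sketch (plain C++, illustrative values only) of the
backward-compatibility rule above: a short 32-bit hex literal used with
an FP64 operand denotes the high 32 bits of the double, matching the
`Literal64 <<= 32` handling in the disassembler and the `Val << 32`
handling in the asm parser.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  int main() {
    // A short hex literal such as 0x405ec000 written for an f64 operand is
    // taken as the high 32 bits of the double; the low half is zero.
    uint32_t ShortHex = 0x405ec000;
    uint64_t Bits = uint64_t(ShortHex) << 32;

    double D;
    std::memcpy(&D, &Bits, sizeof(D));
    std::printf("%g\n", D); // prints 123 (0x405EC00000000000 == 123.0)
    return 0;
  }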
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 21 ++++++++++++--
 .../Disassembler/AMDGPUDisassembler.cpp       | 28 ++++++++++++++-----
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  9 ++++--
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 12 +++++---
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +-
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp      |  5 +++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  4 ++-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  7 +++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++
 9 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 1e07e8deb560fcb..253a2e98f0cb685 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2141,9 +2141,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
           const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
           "Can't encode literal as exact 64-bit floating-point operand. "
           "Low 32-bits will be set to zero");
+          Val &= 0xffffffff00000000u;
         }
 
-        Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+        Inst.addOperand(MCOperand::createImm(Val));
         setImmKindLiteral();
         return;
       }
@@ -2242,7 +2243,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
       return;
     }
 
-    Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+    if (isInt<32>(Val) || isUInt<32>(Val))
+      Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? Val << 32 : Lo_32(Val);
+
+    Inst.addOperand(MCOperand::createImm(Val));
     setImmKindLiteral();
     return;
 
@@ -4309,7 +4313,18 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
       continue;
 
     if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
-      uint32_t Value = static_cast<uint32_t>(MO.getImm());
+      uint64_t Value = static_cast<uint64_t>(MO.getImm());
+      bool IsFP = AMDGPU::isSISrcFPOperand(Desc, OpIdx);
+      bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP);
+
+      if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) {
+        Error(getLitLoc(Operands), "invalid operand for instruction");
+        return false;
+      }
+
+      if (IsFP && IsValid32Op)
+        Value = Hi_32(Value);
+
       if (NumLiterals == 0 || LiteralValue != Value) {
         LiteralValue = Value;
         ++NumLiterals;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index a504a5e86760bd6..83d973dc62e7770 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -378,6 +378,15 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
 }
 
+static DecodeStatus
+decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr,
+                       const MCDisassembler *Decoder) {
+  assert(Imm < (1 << 9) && "9-bit encoding");
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm,
+                                            false, 64, true));
+}
+
 static DecodeStatus
 DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
@@ -1219,7 +1228,7 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
   return MCOperand::createImm(Literal);
 }
 
-MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
@@ -1229,9 +1238,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
                         Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal = eatBytes<uint32_t>(Bytes);
+    Literal = Literal64 = eatBytes<uint32_t>(Bytes);
+    if (ExtendFP64)
+      Literal64 <<= 32;
   }
-  return MCOperand::createImm(Literal);
+  return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
 }
 
 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1448,7 +1459,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
 
 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                           bool MandatoryLiteral,
-                                          unsigned ImmWidth) const {
+                                          unsigned ImmWidth,
+                                          bool IsFP) const {
   using namespace AMDGPU::EncValues;
 
   assert(Val < 1024); // enum10
@@ -1460,13 +1472,15 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
     return createRegOperand(IsAGPR ? getAgprClassId(Width)
                                    : getVgprClassId(Width), Val - VGPR_MIN);
   }
-  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth);
+  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
+                            IsFP);
 }
 
 MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
                                                  unsigned Val,
                                                  bool MandatoryLiteral,
-                                                 unsigned ImmWidth) const {
+                                                 unsigned ImmWidth,
+                                                 bool IsFP) const {
   // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
   // decoded earlier.
   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
@@ -1494,7 +1508,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
       // Keep a sentinel value for deferred setting
       return MCOperand::createImm(LITERAL_CONST);
     else
-      return decodeLiteralConstant();
+      return decodeLiteralConstant(IsFP && ImmWidth == 64);
   }
 
   switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5f3b277d577ff7c..865db2b26307b43 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -97,6 +97,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   const unsigned TargetMaxInstBytes;
   mutable ArrayRef<uint8_t> Bytes;
   mutable uint32_t Literal;
+  mutable uint64_t Literal64;
   mutable bool HasLiteral;
   mutable std::optional<bool> EnableWavefrontSize32;
 
@@ -229,15 +230,17 @@ class AMDGPUDisassembler : public MCDisassembler {
   static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm);
 
   MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
-  MCOperand decodeLiteralConstant() const;
+  MCOperand decodeLiteralConstant(bool ExtendFP64) const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
                         bool MandatoryLiteral = false,
-                        unsigned ImmWidth = 0) const;
+                        unsigned ImmWidth = 0,
+                        bool IsFP = false) const;
 
   MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
                                bool MandatoryLiteral = false,
-                               unsigned ImmWidth = 0) const;
+                               unsigned ImmWidth = 0,
+                               bool IsFP = false) const;
 
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad4c48a8d65581a..40e92f00a9e52a6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -426,7 +426,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
 
 void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
                                          const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
+                                         raw_ostream &O, bool IsFP) {
   int64_t SImm = static_cast<int64_t>(Imm);
   if (SImm >= -16 && SImm <= 64) {
     O << SImm;
@@ -454,6 +454,8 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
   else if (Imm == 0x3fc45f306dc9c882 &&
            STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
     O << "0.15915494309189532";
+  else if (IsFP && AMDGPU::isValid32BitLiteral(Imm, true))
+    O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
   else {
     assert(isUInt<32>(Imm) || isInt<32>(Imm));
 
@@ -605,11 +607,13 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       printImmediate32(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+      printImmediate64(Op.getImm(), STI, O, false);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-      printImmediate64(Op.getImm(), STI, O);
+      printImmediate64(Op.getImm(), STI, O, true);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
@@ -671,7 +675,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       if (RCBits == 32)
         printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
       else if (RCBits == 64)
-        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O);
+        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
       else
         llvm_unreachable("Invalid register class size");
     }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3b14faab136b35a..dc83547a4afe049 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -91,7 +91,7 @@ class AMDGPUInstPrinter : public MCInstPrinter {
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
   void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
+                        raw_ostream &O, bool IsFP);
   void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
   void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 21243f80e055499..d93f747bf6f0a64 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -411,7 +411,10 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
     } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
       llvm_unreachable("Must be immediate or expr");
 
-    support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little);
+    if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64)
+      Imm = Hi_32(Imm);
+
+    support::endian::write<uint32_t>(CB, Imm, support::endianness::little);
 
     // Only one literal value allowed
     break;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c3c5bfae405aa45..ea06e85fb400c1b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1263,7 +1263,9 @@ def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">;
-def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrc_f64";
+}
 def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
 def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d0ad763d9e6cc1..e7907b28abedf9d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2519,6 +2519,13 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   return Lo16 == Hi16;
 }
 
+bool isValid32BitLiteral(uint64_t Val, bool IsFP) {
+  if (IsFP)
+    return !(Val & 0xffffffffu);
+
+  return isUInt<32>(Val) || isInt<32>(Val);
+}
+
 bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 297a69f54d63721..fbe9adfd74fa9c6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1290,6 +1290,9 @@ bool isInlinableIntLiteralV216(int32_t Literal);
 LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isValid32BitLiteral(uint64_t Val, bool IsFP);
+
 bool isArgPassedInSGPR(const Argument *Arg);
 
 bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);

From 6e7dffa80693b714a15c644c3f8df71d2ac14be8 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 12 Oct 2023 01:16:56 -0700
Subject: [PATCH 03/10] [AMDGPU] Make clang-format happy with disasm changes

---
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp      | 13 ++++++-------
 .../Target/AMDGPU/Disassembler/AMDGPUDisassembler.h |  6 ++----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 83d973dc62e7770..d74fd0b3a9ea74e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -378,13 +378,13 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
 }
 
-static DecodeStatus
-decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr,
-                       const MCDisassembler *Decoder) {
+static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
+                                           uint64_t Addr,
+                                           const MCDisassembler *Decoder) {
   assert(Imm < (1 << 9) && "9-bit encoding");
   auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
-  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm,
-                                            false, 64, true));
+  return addOperand(
+      Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true));
 }
 
 static DecodeStatus
@@ -1459,8 +1459,7 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
 
 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                           bool MandatoryLiteral,
-                                          unsigned ImmWidth,
-                                          bool IsFP) const {
+                                          unsigned ImmWidth, bool IsFP) const {
   using namespace AMDGPU::EncValues;
 
   assert(Val < 1024); // enum10
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 865db2b26307b43..91b73b593d61617 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -233,14 +233,12 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeLiteralConstant(bool ExtendFP64) const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
-                        bool MandatoryLiteral = false,
-                        unsigned ImmWidth = 0,
+                        bool MandatoryLiteral = false, unsigned ImmWidth = 0,
                         bool IsFP = false) const;
 
   MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
                                bool MandatoryLiteral = false,
-                               unsigned ImmWidth = 0,
-                               bool IsFP = false) const;
+                               unsigned ImmWidth = 0, bool IsFP = false) const;
 
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;

>From 0ce6255a50584863c2f462390cac6a63ccb5f136 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 12 Oct 2023 11:26:48 +0100
Subject: [PATCH 04/10] [HIP][LLVM][Opt] Add LLVM support for `hipstdpar`

This patch adds the LLVM changes needed for enabling HIP parallel algorithm offload on AMDGPU targets. What we do here is add two passes, one mandatory and one optional:

1. HipStdParAcceleratorCodeSelectionPass is mandatory, depends on CallGraphAnalysis, and implements the following transform:

    - Traverse the call-graph, and check for functions that are roots for accelerator execution (at the moment, these are GPU kernels exclusively, and would originate in the accelerator specific algorithm library the toolchain uses as an implementation detail);
    - Starting from a root, do a BFS to find all functions that are reachable (called directly or indirectly via a call chain) and record them;
    - After having done the above for all roots in the Module, we have computed the set of reachable functions, which is the union of roots and functions reachable from roots (see the sketch after this list);
    - All functions that are not in the reachable set are removed; for the special case where the reachable set is empty we completely clear the module;

2. HipStdParAllocationInterpositionPass is optional, is meant as a fallback with restricted functionality for cases where on-demand paging is unavailable on a platform, and implements the following transform:
    - Iterate all functions in a Module;
    - If a function's name is in a predefined set of allocation / deallocation functions that the runtime implementation is allowed and expected to interpose, replace all its uses with the equivalent accelerator-aware function, iff the latter is available;
        - If the accelerator aware equivalent is unavailable we warn, but compilation will go ahead, which means that it is possible to get issues around the accelerator trying to access inaccessible memory at run time;
    - We rely on direct name matching as opposed to using the new alloc-kind family of attributes and / or the LibCall analysis pass because some of the legacy functions that need replacing would not carry the former or be identified by the latter.
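
For readers skimming the patch, a condensed standalone sketch of the
reachability computation in (1); the container choices and names below
are illustrative only, while the actual pass (HipStdPar.cpp in this
patch) walks the CallGraphAnalysis result and also diagnoses
unsupported callees as it goes:

  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  // Toy call graph: function name -> direct callees.
  using ToyCallGraph = std::map<std::string, std::vector<std::string>>;

  // Worklist traversal from the kernel roots; everything *not* returned here
  // is what the pass erases (the whole module is cleared if there are no
  // roots at all).
  std::set<std::string> reachableFromRoots(const ToyCallGraph &CG,
                                           const std::set<std::string> &Roots) {
    std::set<std::string> Reachable(Roots.begin(), Roots.end());
    std::vector<std::string> Worklist(Roots.begin(), Roots.end());
    while (!Worklist.empty()) {
      std::string F = Worklist.back();
      Worklist.pop_back();
      auto It = CG.find(F);
      if (It == CG.end())
        continue;
      for (const std::string &Callee : It->second)
        if (Reachable.insert(Callee).second) // newly discovered
          Worklist.push_back(Callee);
    }
    return Reachable;
  }

The interposition in (2) is, by comparison, a direct name-to-name
mapping; the full replacement table and the replaceAllUsesWith logic
are in HipStdPar.cpp below.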

Reviewed by: JonChesterfield, yaxunl

Differential Revision: https://reviews.llvm.org/D155856
---
 .../llvm/Transforms/HipStdPar/HipStdPar.h     |  46 +++
 llvm/lib/Passes/CMakeLists.txt                |   1 +
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   1 +
 llvm/lib/Passes/PassRegistry.def              |   3 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   8 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Transforms/CMakeLists.txt            |   1 +
 llvm/lib/Transforms/HipStdPar/CMakeLists.txt  |  18 +
 llvm/lib/Transforms/HipStdPar/HipStdPar.cpp   | 312 ++++++++++++++++++
 .../HipStdPar/accelerator-code-selection.ll   | 116 +++++++
 .../HipStdPar/allocation-interposition.ll     | 221 +++++++++++++
 .../HipStdPar/allocation-no-interposition.ll  | 161 +++++++++
 .../Transforms/HipStdPar/unsupported-asm.ll   |  12 +
 .../HipStdPar/unsupported-builtins.ll         |  11 +
 .../unsupported-thread-local-direct-use.ll    |  14 +
 .../unsupported-thread-local-indirect-use.ll  |  14 +
 17 files changed, 941 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
 create mode 100644 llvm/lib/Transforms/HipStdPar/CMakeLists.txt
 create mode 100644 llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
 create mode 100644 llvm/test/Transforms/HipStdPar/accelerator-code-selection.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/allocation-interposition.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/allocation-no-interposition.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/unsupported-asm.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/unsupported-builtins.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/unsupported-thread-local-direct-use.ll
 create mode 100644 llvm/test/Transforms/HipStdPar/unsupported-thread-local-indirect-use.ll

diff --git a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
new file mode 100644
index 000000000000000..9df093d8d5d5977
--- /dev/null
+++ b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
@@ -0,0 +1,46 @@
+//===--------- HipStdPar.h - Standard Parallelism passes --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// AcceleratorCodeSelection - Identify all functions reachable from a kernel,
+/// removing those that are unreachable.
+///
+/// AllocationInterposition - Forward calls to allocation / deallocation
+//  functions to runtime provided equivalents that allocate memory that is
+//  accessible for an accelerator
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H
+#define LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+class ModuleAnalysisManager;
+
+class HipStdParAcceleratorCodeSelectionPass
+  : public PassInfoMixin<HipStdParAcceleratorCodeSelectionPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+  static bool isRequired() { return true; }
+};
+
+class HipStdParAllocationInterpositionPass
+  : public PassInfoMixin<HipStdParAllocationInterpositionPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H
diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt
index 576d0f3ff442983..e42edfe94969745 100644
--- a/llvm/lib/Passes/CMakeLists.txt
+++ b/llvm/lib/Passes/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_component_library(LLVMPasses
   CodeGen
   Core
   Coroutines
+  HipStdPar
   IPO
   InstCombine
   IRPrinter
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 985ff88139323c6..fde759026e5d780 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -94,6 +94,7 @@
 #include "llvm/Transforms/Coroutines/CoroEarly.h"
 #include "llvm/Transforms/Coroutines/CoroElide.h"
 #include "llvm/Transforms/Coroutines/CoroSplit.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/Annotation2Metadata.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 44f78c45fa1c8e2..600f8d43caaf216 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Transforms/Coroutines/CoroEarly.h"
 #include "llvm/Transforms/Coroutines/CoroElide.h"
 #include "llvm/Transforms/Coroutines/CoroSplit.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/Annotation2Metadata.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index df9f14920f29161..91782d661ddd7b7 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -64,6 +64,9 @@ MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("globalopt", GlobalOptPass())
 MODULE_PASS("globalsplit", GlobalSplitPass())
+MODULE_PASS("hipstdpar-select-accelerator-code",
+  HipStdParAcceleratorCodeSelectionPass())
+MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
 MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
 MODULE_PASS("inferattrs", InferFunctionAttrsPass())
 MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b7c1f03459cb640..dc7321cd5de9fcd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -50,6 +50,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -348,6 +349,11 @@ static cl::opt<bool> EnableRewritePartialRegUses(
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
     cl::Hidden);
 
+static cl::opt<bool> EnableHipStdPar(
+  "amdgpu-enable-hipstdpar",
+  cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
+  cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -699,6 +705,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
           FPM.addPass(AMDGPUSimplifyLibCallsPass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+        if (EnableHipStdPar)
+          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
       });
 
   PB.registerPipelineEarlySimplificationEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 8124fdd5ddfefec..0c0720890794b66 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -176,6 +176,7 @@ add_llvm_target(AMDGPUCodeGen
   CodeGenTypes
   Core
   GlobalISel
+  HipStdPar
   IPO
   MC
   MIRParser
diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index dda5f6de11e326a..84a7e34147d0843 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -9,3 +9,4 @@ add_subdirectory(Hello)
 add_subdirectory(ObjCARC)
 add_subdirectory(Coroutines)
 add_subdirectory(CFGuard)
+add_subdirectory(HipStdPar)
diff --git a/llvm/lib/Transforms/HipStdPar/CMakeLists.txt b/llvm/lib/Transforms/HipStdPar/CMakeLists.txt
new file mode 100644
index 000000000000000..bd397f44c34ff65
--- /dev/null
+++ b/llvm/lib/Transforms/HipStdPar/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_llvm_component_library(LLVMHipStdPar
+  HipStdPar.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/HipStdPar
+
+  DEPENDS
+  intrinsics_gen
+  LLVMAnalysis
+
+  COMPONENT_NAME
+  HipStdPar
+
+  LINK_COMPONENTS
+  Analysis
+  Core
+  Support
+  TransformUtils)
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
new file mode 100644
index 000000000000000..fb7cba9edbdb8b2
--- /dev/null
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -0,0 +1,312 @@
+//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements two passes that enable HIP C++ Standard Parallelism
+// Support:
+//
+// 1. AcceleratorCodeSelection (required): Given that only algorithms are
+//    accelerated, and that the accelerated implementation exists in the form of
+//    a compute kernel, we assume that only the kernel, and all functions
+//    reachable from it, constitute code that the user expects the accelerator
+//    to execute. Thus, we identify the set of all functions reachable from
+//    kernels, and then remove all unreachable ones. This last part is necessary
+//    because it is possible for code that the user did not expect to execute on
+//    an accelerator to contain constructs that cannot be handled by the target
+//    BE, which cannot be provably demonstrated to be dead code in general, and
+//    thus can lead to mis-compilation. The degenerate case of this is when a
+//    Module contains no kernels (the parent TU had no algorithm invocations fit
+//    for acceleration), which we handle by completely emptying said module.
+//    **NOTE**: The above does not handle indirectly reachable functions i.e.
+//              it is possible to obtain a case where the target of an indirect
+//              call is otherwise unreachable and thus is removed; this
+//              restriction is aligned with the current `-hipstdpar` limitations
+//              and will be relaxed in the future.
+//
+// 2. AllocationInterposition (required only when on-demand paging is
+//    unsupported): Some accelerators or operating systems might not support
+//    transparent on-demand paging. Thus, they would only be able to access
+//    memory that is allocated by an accelerator-aware mechanism. For such cases
+//    the user can opt into enabling allocation / deallocation interposition,
+//    whereby we replace calls to known allocation / deallocation functions with
+//    calls to runtime implemented equivalents that forward the requests to
+//    accelerator-aware interfaces. We also support freeing system allocated
+//    memory that ends up in one of the runtime equivalents, since this can
+//    happen if e.g. a library that was compiled without interposition returns
+//    an allocation that can be validly passed to `free`.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <cassert>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+template<typename T>
+static inline void eraseFromModule(T &ToErase) {
+  ToErase.replaceAllUsesWith(PoisonValue::get(ToErase.getType()));
+  ToErase.eraseFromParent();
+}
+
+static inline bool checkIfSupported(GlobalVariable &G) {
+  if (!G.isThreadLocal())
+    return true;
+
+  G.dropDroppableUses();
+
+  if (!G.isConstantUsed())
+    return true;
+
+  std::string W;
+  raw_string_ostream OS(W);
+
+  OS << "Accelerator does not support the thread_local variable "
+    << G.getName();
+
+  Instruction *I = nullptr;
+  SmallVector<User *> Tmp(G.user_begin(), G.user_end());
+  SmallPtrSet<User *, 5> Visited;
+  do {
+    auto U = std::move(Tmp.back());
+    Tmp.pop_back();
+
+    if (Visited.contains(U))
+      continue;
+
+    if (isa<Instruction>(U))
+      I = cast<Instruction>(U);
+    else
+      Tmp.insert(Tmp.end(), U->user_begin(), U->user_end());
+
+    Visited.insert(U);
+  } while (!I && !Tmp.empty());
+
+  assert(I && "thread_local global should have at least one non-constant use.");
+
+  G.getContext().diagnose(
+    DiagnosticInfoUnsupported(*I->getParent()->getParent(), W,
+                              I->getDebugLoc(), DS_Error));
+
+  return false;
+}
+
+static inline void clearModule(Module &M) { // TODO: simplify.
+  while (!M.functions().empty())
+    eraseFromModule(*M.begin());
+  while (!M.globals().empty())
+    eraseFromModule(*M.globals().begin());
+  while (!M.aliases().empty())
+    eraseFromModule(*M.aliases().begin());
+  while (!M.ifuncs().empty())
+    eraseFromModule(*M.ifuncs().begin());
+}
+
+static inline void maybeHandleGlobals(Module &M) {
+  unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+  for (auto &&G : M.globals()) { // TODO: should we handle these in the FE?
+    if (!checkIfSupported(G))
+      return clearModule(M);
+
+    if (G.isThreadLocal())
+      continue;
+    if (G.isConstant())
+      continue;
+    if (G.getAddressSpace() != GlobAS)
+      continue;
+    if (G.getLinkage() != GlobalVariable::ExternalLinkage)
+      continue;
+
+    G.setLinkage(GlobalVariable::ExternalWeakLinkage);
+    G.setExternallyInitialized(true);
+  }
+}
+
+template<unsigned N>
+static inline void removeUnreachableFunctions(
+  const SmallPtrSet<const Function *, N>& Reachable, Module &M) {
+  removeFromUsedLists(M, [&](Constant *C) {
+    if (auto F = dyn_cast<Function>(C))
+      return !Reachable.contains(F);
+
+    return false;
+  });
+
+  SmallVector<std::reference_wrapper<Function>> ToRemove;
+  copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) {
+    return !F.isIntrinsic() && !Reachable.contains(&F);
+  });
+
+  for_each(ToRemove, eraseFromModule<Function>);
+}
+
+static inline bool isAcceleratorExecutionRoot(const Function *F) {
+    if (!F)
+      return false;
+
+    return F->getCallingConv() == CallingConv::AMDGPU_KERNEL;
+}
+
+static inline bool checkIfSupported(const Function *F, const CallBase *CB) {
+  const auto Dx = F->getName().rfind("__hipstdpar_unsupported");
+
+  if (Dx == StringRef::npos)
+    return true;
+
+  const auto N = F->getName().substr(0, Dx);
+
+  std::string W;
+  raw_string_ostream OS(W);
+
+  if (N == "__ASM")
+    OS << "Accelerator does not support the ASM block:\n"
+      << cast<ConstantDataArray>(CB->getArgOperand(0))->getAsCString();
+  else
+    OS << "Accelerator does not support the " << N << " function.";
+
+  auto Caller = CB->getParent()->getParent();
+
+  Caller->getContext().diagnose(
+    DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error));
+
+  return false;
+}
+
+PreservedAnalyses
+  HipStdParAcceleratorCodeSelectionPass::run(Module &M,
+                                             ModuleAnalysisManager &MAM) {
+  auto &CGA = MAM.getResult<CallGraphAnalysis>(M);
+
+  SmallPtrSet<const Function *, 32> Reachable;
+  for (auto &&CGN : CGA) {
+    if (!isAcceleratorExecutionRoot(CGN.first))
+      continue;
+
+    Reachable.insert(CGN.first);
+
+    SmallVector<const Function *> Tmp({CGN.first});
+    do {
+      auto F = std::move(Tmp.back());
+      Tmp.pop_back();
+
+      for (auto &&N : *CGA[F]) {
+        if (!N.second)
+          continue;
+        if (!N.second->getFunction())
+          continue;
+        if (Reachable.contains(N.second->getFunction()))
+          continue;
+
+        if (!checkIfSupported(N.second->getFunction(),
+                              dyn_cast<CallBase>(*N.first)))
+          return PreservedAnalyses::none();
+
+        Reachable.insert(N.second->getFunction());
+        Tmp.push_back(N.second->getFunction());
+      }
+    } while (!std::empty(Tmp));
+  }
+
+  if (std::empty(Reachable))
+    clearModule(M);
+  else
+    removeUnreachableFunctions(Reachable, M);
+
+  maybeHandleGlobals(M);
+
+  return PreservedAnalyses::none();
+}
+
+static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
+  {"aligned_alloc",             "__hipstdpar_aligned_alloc"},
+  {"calloc",                    "__hipstdpar_calloc"},
+  {"free",                      "__hipstdpar_free"},
+  {"malloc",                    "__hipstdpar_malloc"},
+  {"memalign",                  "__hipstdpar_aligned_alloc"},
+  {"posix_memalign",            "__hipstdpar_posix_aligned_alloc"},
+  {"realloc",                   "__hipstdpar_realloc"},
+  {"reallocarray",              "__hipstdpar_realloc_array"},
+  {"_ZdaPv",                    "__hipstdpar_operator_delete"},
+  {"_ZdaPvm",                   "__hipstdpar_operator_delete_sized"},
+  {"_ZdaPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
+  {"_ZdaPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
+  {"_ZdlPv",                    "__hipstdpar_operator_delete"},
+  {"_ZdlPvm",                   "__hipstdpar_operator_delete_sized"},
+  {"_ZdlPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
+  {"_ZdlPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
+  {"_Znam",                     "__hipstdpar_operator_new"},
+  {"_ZnamRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
+  {"_ZnamSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
+  {"_ZnamSt11align_val_tRKSt9nothrow_t",
+                                "__hipstdpar_operator_new_aligned_nothrow"},
+
+  {"_Znwm",                     "__hipstdpar_operator_new"},
+  {"_ZnwmRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
+  {"_ZnwmSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
+  {"_ZnwmSt11align_val_tRKSt9nothrow_t",
+                                "__hipstdpar_operator_new_aligned_nothrow"},
+  {"__builtin_calloc",          "__hipstdpar_calloc"},
+  {"__builtin_free",            "__hipstdpar_free"},
+  {"__builtin_malloc",          "__hipstdpar_malloc"},
+  {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
+  {"__builtin_operator_new",    "__hipstdpar_operator_new"},
+  {"__builtin_realloc",         "__hipstdpar_realloc"},
+  {"__libc_calloc",             "__hipstdpar_calloc"},
+  {"__libc_free",               "__hipstdpar_free"},
+  {"__libc_malloc",             "__hipstdpar_malloc"},
+  {"__libc_memalign",           "__hipstdpar_aligned_alloc"},
+  {"__libc_realloc",            "__hipstdpar_realloc"}
+};
+
+PreservedAnalyses
+HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
+  SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(ReplaceMap),
+                                                        std::cend(ReplaceMap));
+
+  for (auto &&F : M) {
+    if (!F.hasName())
+      continue;
+    if (!AllocReplacements.contains(F.getName()))
+      continue;
+
+    if (auto R = M.getFunction(AllocReplacements[F.getName()])) {
+      F.replaceAllUsesWith(R);
+    } else {
+      std::string W;
+      raw_string_ostream OS(W);
+
+      OS << "cannot be interposed, missing: " << AllocReplacements[F.getName()]
+        << ". Tried to run the allocation interposition pass without the "
+        << "replacement functions available.";
+
+      F.getContext().diagnose(DiagnosticInfoUnsupported(F, W,
+                                                        F.getSubprogram(),
+                                                        DS_Warning));
+    }
+  }
+
+  if (auto F = M.getFunction("__hipstdpar_hidden_free")) {
+    auto LibcFree = M.getOrInsertFunction("__libc_free", F->getFunctionType(),
+                                          F->getAttributes());
+    F->replaceAllUsesWith(LibcFree.getCallee());
+
+    eraseFromModule(*F);
+  }
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/test/Transforms/HipStdPar/accelerator-code-selection.ll b/llvm/test/Transforms/HipStdPar/accelerator-code-selection.ll
new file mode 100644
index 000000000000000..2d41bd9b3443515
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/accelerator-code-selection.ll
@@ -0,0 +1,116 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s | FileCheck %s
+
+$_ZNK8CallableclEPi = comdat any
+$_ZNK8CallableclEPf = comdat any
+$_ZNK8Callable6mem_fnEPKi = comdat any
+$_ZN8Callable13static_mem_fnEPKi = comdat any
+; CHECK-NOT: $_ZNK8Callable37another_mem_fn_which_will_get_removedEPKf
+$_ZNK8Callable37another_mem_fn_which_will_get_removedEPKf = comdat any
+; CHECK-NOT: $_ZN8Callable44another_static_mem_fn_which_will_get_removedEPKf
+$_ZN8Callable44another_static_mem_fn_which_will_get_removedEPKf = comdat any
+
+%struct.Callable = type { [64 x i8] }
+
+; CHECK-NOT: @should_be_removed
+@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr @should_be_removed], section "llvm.metadata"
+
+define void @should_be_removed(ptr %p) {
+  ret void
+}
+
+declare void @llvm.trap()
+
+; CHECK: define {{.*}} @called_via_chain
+define void @called_via_chain(ptr %p) {
+  entry:
+    %tobool.not = icmp eq ptr %p, null
+    br i1 %tobool.not, label %if.then, label %if.end
+
+  if.then:
+    tail call void @llvm.trap()
+    unreachable
+
+  if.end:
+    ret void
+}
+
+; CHECK: define {{.*}} @directly_called
+define void @directly_called(ptr %p) {
+  tail call void @called_via_chain(ptr %p)
+  ret void
+}
+
+; CHECK: define {{.*}} amdgpu_kernel {{.*}} @accelerator_execution_root
+define hidden amdgpu_kernel void @accelerator_execution_root(ptr %p) {
+  tail call void @directly_called(ptr %p)
+  ret void
+}
+
+; CHECK-NOT: @defined_elsewhere_should_be_removed
+declare void @defined_elsewhere_should_be_removed(ptr)
+
+; CHECK: declare {{.*}} @defined_elsewhere_directly_called
+declare void @defined_elsewhere_directly_called(ptr)
+
+; CHECK: define {{.*}} amdgpu_kernel {{.*}} @another_accelerator_execution_root
+define hidden amdgpu_kernel void @another_accelerator_execution_root(ptr %p) {
+  tail call void @defined_elsewhere_directly_called(ptr %p)
+  ret void
+}
+
+; Also test passing a callable object (functor / lambda) to a kernel, which is
+; the common pattern for customising algorithms.
+
+; CHECK: define {{.*}} amdgpu_kernel {{.*}} @_Z22accelerator_execution_root_taking_callablePi8Callable
+define hidden amdgpu_kernel void @_Z22accelerator_execution_root_taking_callablePi8Callable(ptr noundef %p, ptr addrspace(4) nocapture readonly byref(%struct.Callable) align 8 %callable) {
+  %callable_in_generic = addrspacecast ptr addrspace(4) %callable to ptr
+  call void @_ZNK8CallableclEPi(ptr noundef nonnull align 1 dereferenceable(64) %callable_in_generic, ptr noundef %p)
+
+  ret void
+}
+
+; CHECK: define {{.*}} @_ZNK8CallableclEPi
+define linkonce_odr dso_local void @_ZNK8CallableclEPi(ptr noundef nonnull align 1 dereferenceable(64) %this, ptr noundef %p) {
+  call void @_ZNK8Callable6mem_fnEPKi(ptr noundef nonnull align 1 dereferenceable(1) %this, ptr noundef %p)
+
+  ret void
+}
+
+; CHECK: define {{.*}} @_ZNK8Callable6mem_fnEPKi
+define linkonce_odr dso_local void @_ZNK8Callable6mem_fnEPKi(ptr noundef nonnull align 1 dereferenceable(1) %this, ptr noundef %p) {
+  call void @_ZN8Callable13static_mem_fnEPKi(ptr noundef %p)
+
+  ret void
+}
+
+; CHECK: define {{.*}} @_ZN8Callable13static_mem_fnEPKi
+define linkonce_odr dso_local void @_ZN8Callable13static_mem_fnEPKi(ptr noundef %p) {
+  ret void
+}
+
+; CHECK-NOT: define {{.*}} @_Z26non_kernel_taking_callablePf8Callable
+define dso_local void @_Z26non_kernel_taking_callablePf8Callable(ptr noundef %p, ptr noundef byval(%struct.Callable) align 8 %callable) {
+  call void @_ZNK8CallableclEPf(ptr noundef nonnull align 1 dereferenceable(64) %callable, ptr noundef %p)
+
+  ret void
+}
+
+; CHECK-NOT: define {{.*}} @_ZNK8CallableclEPf
+define linkonce_odr dso_local void @_ZNK8CallableclEPf(ptr noundef nonnull align 1 dereferenceable(64) %this, ptr noundef %p) {
+  call void @_ZNK8Callable37another_mem_fn_which_will_get_removedEPKf(ptr noundef nonnull align 1 dereferenceable(64) %this, ptr noundef %p)
+
+  ret void
+}
+
+; CHECK-NOT: @_ZNK8Callable37another_mem_fn_which_will_get_removedEPKf
+define linkonce_odr dso_local void @_ZNK8Callable37another_mem_fn_which_will_get_removedEPKf(ptr noundef nonnull align 1 dereferenceable(64) %this, ptr noundef %p) {
+  call void @_ZN8Callable44another_static_mem_fn_which_will_get_removedEPKf(ptr noundef %p)
+
+  ret void
+}
+
+; CHECK-NOT: @_ZN8Callable44another_static_mem_fn_which_will_get_removedEPKf
+define linkonce_odr dso_local void @_ZN8Callable44another_static_mem_fn_which_will_get_removedEPKf(ptr noundef %p) {
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/Transforms/HipStdPar/allocation-interposition.ll b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll
new file mode 100644
index 000000000000000..291b06ed0ca9edc
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll
@@ -0,0 +1,221 @@
+; RUN: opt -S -passes=hipstdpar-interpose-alloc %s | FileCheck %s
+
+%"struct.std::nothrow_t" = type { i8 }
+
+@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1
+
+declare ptr @__hipstdpar_aligned_alloc(i64, i64)
+
+declare ptr @__hipstdpar_malloc(i64)
+
+declare ptr @__hipstdpar_calloc(i64, i64)
+
+declare i32 @__hipstdpar_posix_aligned_alloc(ptr, i64, i64)
+
+declare void @__hipstdpar_hidden_free(ptr)
+
+declare ptr @__hipstdpar_realloc(ptr, i64)
+
+declare ptr @__hipstdpar_realloc_array(ptr, i64, i64)
+
+declare void @__hipstdpar_free(ptr)
+
+declare ptr @__hipstdpar_operator_new_aligned(i64, i64)
+
+declare ptr @__hipstdpar_operator_new(i64)
+
+declare ptr @__hipstdpar_operator_new_nothrow(i64, %"struct.std::nothrow_t")
+
+declare ptr @__hipstdpar_operator_new_aligned_nothrow(i64, i64, %"struct.std::nothrow_t")
+
+declare void @__hipstdpar_operator_delete_aligned_sized(ptr, i64, i64)
+
+declare void @__hipstdpar_operator_delete(ptr)
+
+declare void @__hipstdpar_operator_delete_aligned(ptr, i64)
+
+declare void @__hipstdpar_operator_delete_sized(ptr, i64)
+
+define dso_local noundef i32 @allocs() {
+  ; CHECK: %1 = call noalias align 8 ptr @__hipstdpar_aligned_alloc(i64 noundef 8, i64 noundef 42)
+  %1 = call noalias align 8 ptr @aligned_alloc(i64 noundef 8, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %1)
+  call void @free(ptr noundef %1)
+
+  ; CHECK: %2 = call noalias ptr @__hipstdpar_calloc(i64 noundef 1, i64 noundef 42)
+  %2 = call noalias ptr @calloc(i64 noundef 1, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %2)
+  call void @free(ptr noundef %2)
+
+  ; CHECK: %3 = call noalias ptr @__hipstdpar_malloc(i64 noundef 42)
+  %3 = call noalias ptr @malloc(i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %3)
+  call void @free(ptr noundef %3)
+
+  ; CHECK: %4 = call noalias align 8 ptr @__hipstdpar_aligned_alloc(i64 noundef 8, i64 noundef 42)
+  %4 = call noalias align 8 ptr @memalign(i64 noundef 8, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %4)
+  call void @free(ptr noundef %4)
+
+  %tmp = alloca ptr, align 8
+  ; CHECK: %5 = call i32 @__hipstdpar_posix_aligned_alloc(ptr noundef %tmp, i64 noundef 8, i64 noundef 42)
+  %5 = call i32 @posix_memalign(ptr noundef %tmp, i64 noundef 8, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %tmp)
+  call void @free(ptr noundef %tmp)
+
+  ; CHECK: %6 = call noalias ptr @__hipstdpar_malloc(i64 noundef 42)
+  %6 = call noalias ptr @malloc(i64 noundef 42)
+  ; CHECK: %7 = call ptr @__hipstdpar_realloc(ptr noundef %6, i64 noundef 42)
+  %7 = call ptr @realloc(ptr noundef %6, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %7)
+  call void @free(ptr noundef %7)
+
+  ; CHECK: %8 = call noalias ptr @__hipstdpar_calloc(i64 noundef 1, i64 noundef 42)
+  %8 = call noalias ptr @calloc(i64 noundef 1, i64 noundef 42)
+  ; CHECK: %9 = call ptr @__hipstdpar_realloc_array(ptr noundef %8, i64 noundef 1, i64 noundef 42)
+  %9 = call ptr @reallocarray(ptr noundef %8, i64 noundef 1, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %9)
+  call void @free(ptr noundef %9)
+
+  ; CHECK: %10 = call noalias noundef nonnull ptr @__hipstdpar_operator_new(i64 noundef 1)
+  %10 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 1)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %10)
+  call void @_ZdlPv(ptr noundef %10)
+
+  ; CHECK: %11 = call noalias noundef nonnull align 8 ptr @__hipstdpar_operator_new_aligned(i64 noundef 1, i64 noundef 8)
+  %11 = call noalias noundef nonnull align 8 ptr @_ZnwmSt11align_val_t(i64 noundef 1, i64 noundef 8)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %11, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %11, i64 noundef 8)
+
+  ; CHECK: %12 = call noalias noundef ptr @__hipstdpar_operator_new_nothrow(i64 noundef 1, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %12 = call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 1, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %12)
+  call void @_ZdlPv(ptr noundef %12)
+
+  ; CHECK: %13 = call noalias noundef align 8 ptr @__hipstdpar_operator_new_aligned_nothrow(i64 noundef 1, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %13 = call noalias noundef align 8 ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef 1, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %13, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %13, i64 noundef 8)
+
+  ; CHECK: %14 = call noalias noundef nonnull ptr @__hipstdpar_operator_new(i64 noundef 42)
+  %14 = call noalias noundef nonnull ptr @_Znam(i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %14)
+  call void @_ZdaPv(ptr noundef %14)
+
+  ; CHECK: %15 = call noalias noundef nonnull align 8 ptr @__hipstdpar_operator_new_aligned(i64 noundef 42, i64 noundef 8)
+  %15 = call noalias noundef nonnull align 8 ptr @_ZnamSt11align_val_t(i64 noundef 42, i64 noundef 8)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %15, i64 noundef 8)
+  call void @_ZdaPvSt11align_val_t(ptr noundef %15, i64 noundef 8)
+
+  ; CHECK:  %16 = call noalias noundef ptr @__hipstdpar_operator_new_nothrow(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %16 = call noalias noundef ptr @_ZnamRKSt9nothrow_t(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %16)
+  call void @_ZdaPv(ptr noundef %16)
+
+  ; CHECK: %17 = call noalias noundef align 8 ptr @__hipstdpar_operator_new_aligned_nothrow(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %17 = call noalias noundef align 8 ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %17, i64 noundef 8)
+  call void @_ZdaPvSt11align_val_t(ptr noundef %17, i64 noundef 8)
+
+  ; CHECK:  %18 = call ptr @__hipstdpar_calloc(i64 noundef 1, i64 noundef 42)
+  %18 = call ptr @calloc(i64 noundef 1, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %18)
+  call void @free(ptr noundef %18)
+
+  ; CHECK: %19 = call ptr @__hipstdpar_malloc(i64 noundef 42)
+  %19 = call ptr @malloc(i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %19)
+  call void @free(ptr noundef %19)
+
+  ; CHECK: %20 = call noalias noundef nonnull ptr @__hipstdpar_operator_new(i64 noundef 42)
+  %20 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %20)
+  call void @_ZdlPv(ptr noundef %20)
+
+  ; CHECK:  %21 = call noalias noundef nonnull align 8 ptr @__hipstdpar_operator_new_aligned(i64 noundef 42, i64 noundef 8)
+  %21 = call noalias noundef nonnull align 8 ptr @_ZnwmSt11align_val_t(i64 noundef 42, i64 noundef 8)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %21, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %21, i64 noundef 8)
+
+  ; CHECK: %22 = call noalias noundef ptr @__hipstdpar_operator_new_nothrow(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %22 = call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete(ptr noundef %22)
+  call void @_ZdlPv(ptr noundef %22)
+
+  ; CHECK:  %23 = call noalias noundef align 8 ptr @__hipstdpar_operator_new_aligned_nothrow(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  %23 = call noalias noundef align 8 ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  ; CHECK: call void @__hipstdpar_operator_delete_aligned(ptr noundef %23, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %23, i64 noundef 8)
+
+  ; CHECK: %24 = call ptr @__hipstdpar_malloc(i64 noundef 42)
+  %24 = call ptr @malloc(i64 noundef 42)
+  ; CHECK: %25 = call ptr @__hipstdpar_realloc(ptr noundef %24, i64 noundef 41)
+  %25 = call ptr @realloc(ptr noundef %24, i64 noundef 41)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %25)
+  call void @free(ptr noundef %25)
+
+  ; CHECK: %26 = call ptr @__hipstdpar_calloc(i64 noundef 1, i64 noundef 42)
+  %26 = call ptr @__libc_calloc(i64 noundef 1, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %26)
+  call void @__libc_free(ptr noundef %26)
+
+  ; CHECK: %27 = call ptr @__hipstdpar_malloc(i64 noundef 42)
+  %27 = call ptr @__libc_malloc(i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %27)
+  call void @__libc_free(ptr noundef %27)
+
+  ; CHECK: %28 = call ptr @__hipstdpar_aligned_alloc(i64 noundef 8, i64 noundef 42)
+  %28 = call ptr @__libc_memalign(i64 noundef 8, i64 noundef 42)
+  ; CHECK: call void @__hipstdpar_free(ptr noundef %28)
+  call void @__libc_free(ptr noundef %28)
+
+  ret i32 0
+}
+
+declare noalias ptr @aligned_alloc(i64 noundef, i64 noundef)
+
+declare void @free(ptr noundef)
+
+declare noalias ptr @calloc(i64 noundef, i64 noundef)
+
+declare noalias ptr @malloc(i64 noundef)
+
+declare noalias ptr @memalign(i64 noundef, i64 noundef)
+
+declare i32 @posix_memalign(ptr noundef, i64 noundef, i64 noundef)
+
+declare ptr @realloc(ptr noundef, i64 noundef)
+
+declare ptr @reallocarray(ptr noundef, i64 noundef, i64 noundef)
+
+declare noundef nonnull ptr @_Znwm(i64 noundef)
+
+declare void @_ZdlPv(ptr noundef)
+
+declare noalias noundef nonnull ptr @_ZnwmSt11align_val_t(i64 noundef, i64 noundef)
+
+declare void @_ZdlPvSt11align_val_t(ptr noundef, i64 noundef)
+
+declare noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noalias noundef ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noundef nonnull ptr @_Znam(i64 noundef)
+
+declare void @_ZdaPv(ptr noundef)
+
+declare noalias noundef nonnull ptr @_ZnamSt11align_val_t(i64 noundef, i64 noundef)
+
+declare void @_ZdaPvSt11align_val_t(ptr noundef, i64 noundef)
+
+declare noalias noundef ptr @_ZnamRKSt9nothrow_t(i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noalias noundef ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare ptr @__libc_calloc(i64 noundef, i64 noundef)
+
+declare void @__libc_free(ptr noundef)
+
+declare ptr @__libc_malloc(i64 noundef)
+
+declare ptr @__libc_memalign(i64 noundef, i64 noundef)
\ No newline at end of file
diff --git a/llvm/test/Transforms/HipStdPar/allocation-no-interposition.ll b/llvm/test/Transforms/HipStdPar/allocation-no-interposition.ll
new file mode 100644
index 000000000000000..15640c6ae94e0d8
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/allocation-no-interposition.ll
@@ -0,0 +1,161 @@
+; RUN: opt < %s -passes=hipstdpar-interpose-alloc -S 2>&1 | FileCheck %s
+
+; CHECK: warning: {{.*}} aligned_alloc {{.*}} cannot be interposed, missing: __hipstdpar_aligned_alloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} free {{.*}} cannot be interposed, missing: __hipstdpar_free. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} calloc {{.*}} cannot be interposed, missing: __hipstdpar_calloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} malloc {{.*}} cannot be interposed, missing: __hipstdpar_malloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} memalign {{.*}} cannot be interposed, missing: __hipstdpar_aligned_alloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} posix_memalign {{.*}} cannot be interposed, missing: __hipstdpar_posix_aligned_alloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} realloc {{.*}} cannot be interposed, missing: __hipstdpar_realloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} reallocarray {{.*}} cannot be interposed, missing: __hipstdpar_realloc_array. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _Znwm {{.*}} cannot be interposed, missing: __hipstdpar_operator_new. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZdlPv {{.*}} cannot be interposed, missing: __hipstdpar_operator_delete. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnwmSt11align_val_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_aligned. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZdlPvSt11align_val_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_delete_aligned. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnwmRKSt9nothrow_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_nothrow. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnwmSt11align_val_tRKSt9nothrow_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_aligned_nothrow. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _Znam {{.*}} cannot be interposed, missing: __hipstdpar_operator_new. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZdaPv {{.*}} cannot be interposed, missing: __hipstdpar_operator_delete. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnamSt11align_val_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_aligned. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZdaPvSt11align_val_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_delete_aligned. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnamRKSt9nothrow_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_nothrow. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} _ZnamSt11align_val_tRKSt9nothrow_t {{.*}} cannot be interposed, missing: __hipstdpar_operator_new_aligned_nothrow. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} __libc_calloc {{.*}} cannot be interposed, missing: __hipstdpar_calloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} __libc_free {{.*}} cannot be interposed, missing: __hipstdpar_free. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} __libc_malloc {{.*}} cannot be interposed, missing: __hipstdpar_malloc. Tried to run the allocation interposition pass without the replacement functions available.
+; CHECK: warning: {{.*}} __libc_memalign {{.*}} cannot be interposed, missing: __hipstdpar_aligned_alloc. Tried to run the allocation interposition pass without the replacement functions available.
+
+%"struct.std::nothrow_t" = type { i8 }
+
+@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1
+
+define dso_local noundef i32 @allocs() {
+  %1 = call noalias align 8 ptr @aligned_alloc(i64 noundef 8, i64 noundef 42)
+  call void @free(ptr noundef %1)
+
+  %2 = call noalias ptr @calloc(i64 noundef 1, i64 noundef 42)
+  call void @free(ptr noundef %2)
+
+  %3 = call noalias ptr @malloc(i64 noundef 42)
+  call void @free(ptr noundef %3)
+
+  %4 = call noalias align 8 ptr @memalign(i64 noundef 8, i64 noundef 42)
+  call void @free(ptr noundef %4)
+
+  %tmp = alloca ptr, align 8
+  %5 = call i32 @posix_memalign(ptr noundef %tmp, i64 noundef 8, i64 noundef 42)
+  call void @free(ptr noundef %tmp)
+
+  %6 = call noalias ptr @malloc(i64 noundef 42)
+  %7 = call ptr @realloc(ptr noundef %6, i64 noundef 42)
+  call void @free(ptr noundef %7)
+
+  %8 = call noalias ptr @calloc(i64 noundef 1, i64 noundef 42)
+  %9 = call ptr @reallocarray(ptr noundef %8, i64 noundef 1, i64 noundef 42)
+  call void @free(ptr noundef %9)
+
+  %10 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 1)
+  call void @_ZdlPv(ptr noundef %10)
+
+  %11 = call noalias noundef nonnull align 8 ptr @_ZnwmSt11align_val_t(i64 noundef 1, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %11, i64 noundef 8)
+
+  %12 = call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 1, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdlPv(ptr noundef %12)
+
+  %13 = call noalias noundef align 8 ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef 1, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %13, i64 noundef 8)
+
+  %14 = call noalias noundef nonnull ptr @_Znam(i64 noundef 42)
+  call void @_ZdaPv(ptr noundef %14)
+
+  %15 = call noalias noundef nonnull align 8 ptr @_ZnamSt11align_val_t(i64 noundef 42, i64 noundef 8)
+  call void @_ZdaPvSt11align_val_t(ptr noundef %15, i64 noundef 8)
+
+  %16 = call noalias noundef ptr @_ZnamRKSt9nothrow_t(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdaPv(ptr noundef %16)
+
+  %17 = call noalias noundef align 8 ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdaPvSt11align_val_t(ptr noundef %17, i64 noundef 8)
+
+  %18 = call ptr @calloc(i64 noundef 1, i64 noundef 42)
+  call void @free(ptr noundef %18)
+
+  %19 = call ptr @malloc(i64 noundef 42)
+  call void @free(ptr noundef %19)
+
+  %20 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 42)
+  call void @_ZdlPv(ptr noundef %20)
+
+  %21 = call noalias noundef nonnull align 8 ptr @_ZnwmSt11align_val_t(i64 noundef 42, i64 noundef 8)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %21, i64 noundef 8)
+
+  %22 = call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 42, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdlPv(ptr noundef %22)
+
+  %23 = call noalias noundef align 8 ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef 42, i64 noundef 8, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+  call void @_ZdlPvSt11align_val_t(ptr noundef %23, i64 noundef 8)
+
+  %24 = call ptr @malloc(i64 noundef 42)
+  %25 = call ptr @realloc(ptr noundef %24, i64 noundef 41)
+  call void @free(ptr noundef %25)
+
+  %26 = call ptr @__libc_calloc(i64 noundef 1, i64 noundef 42)
+  call void @__libc_free(ptr noundef %26)
+
+  %27 = call ptr @__libc_malloc(i64 noundef 42)
+  call void @__libc_free(ptr noundef %27)
+
+  %28 = call ptr @__libc_memalign(i64 noundef 8, i64 noundef 42)
+  call void @__libc_free(ptr noundef %28)
+
+  ret i32 0
+}
+
+declare noalias ptr @aligned_alloc(i64 noundef, i64 noundef)
+
+declare void @free(ptr noundef)
+
+declare noalias ptr @calloc(i64 noundef, i64 noundef)
+
+declare noalias ptr @malloc(i64 noundef)
+
+declare noalias ptr @memalign(i64 noundef, i64 noundef)
+
+declare i32 @posix_memalign(ptr noundef, i64 noundef, i64 noundef)
+
+declare ptr @realloc(ptr noundef, i64 noundef)
+
+declare ptr @reallocarray(ptr noundef, i64 noundef, i64 noundef)
+
+declare noundef nonnull ptr @_Znwm(i64 noundef)
+
+declare void @_ZdlPv(ptr noundef)
+
+declare noalias noundef nonnull ptr @_ZnwmSt11align_val_t(i64 noundef, i64 noundef)
+
+declare void @_ZdlPvSt11align_val_t(ptr noundef, i64 noundef)
+
+declare noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noalias noundef ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noundef nonnull ptr @_Znam(i64 noundef)
+
+declare void @_ZdaPv(ptr noundef)
+
+declare noalias noundef nonnull ptr @_ZnamSt11align_val_t(i64 noundef, i64 noundef)
+
+declare void @_ZdaPvSt11align_val_t(ptr noundef, i64 noundef)
+
+declare noalias noundef ptr @_ZnamRKSt9nothrow_t(i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare noalias noundef ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1))
+
+declare ptr @__libc_calloc(i64 noundef, i64 noundef)
+
+declare void @__libc_free(ptr noundef)
+
+declare ptr @__libc_malloc(i64 noundef)
+
+declare ptr @__libc_memalign(i64 noundef, i64 noundef)
\ No newline at end of file
diff --git a/llvm/test/Transforms/HipStdPar/unsupported-asm.ll b/llvm/test/Transforms/HipStdPar/unsupported-asm.ll
new file mode 100644
index 000000000000000..19e6b28cc91642d
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/unsupported-asm.ll
@@ -0,0 +1,12 @@
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: {{.*}} in function foo void (): Accelerator does not support the ASM block:
+; CHECK-NEXT: {{.*}}Invalid ASM block{{.*}}
+define amdgpu_kernel void @foo() {
+entry:
+  call void @__ASM__hipstdpar_unsupported([18 x i8] c"Invalid ASM block\00")
+  ret void
+}
+
+declare void @__ASM__hipstdpar_unsupported([18 x i8])
\ No newline at end of file
diff --git a/llvm/test/Transforms/HipStdPar/unsupported-builtins.ll b/llvm/test/Transforms/HipStdPar/unsupported-builtins.ll
new file mode 100644
index 000000000000000..19490dad80bab1b
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/unsupported-builtins.ll
@@ -0,0 +1,11 @@
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s 2>&1 | FileCheck %s
+
+; CHECK: error: {{.*}} in function foo void (): Accelerator does not support the __builtin_ia32_pause function
+define amdgpu_kernel void @foo() {
+entry:
+  call void @__builtin_ia32_pause__hipstdpar_unsupported()
+  ret void
+}
+
+declare void @__builtin_ia32_pause__hipstdpar_unsupported()
\ No newline at end of file
diff --git a/llvm/test/Transforms/HipStdPar/unsupported-thread-local-direct-use.ll b/llvm/test/Transforms/HipStdPar/unsupported-thread-local-direct-use.ll
new file mode 100644
index 000000000000000..fa17bb114eae600
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/unsupported-thread-local-direct-use.ll
@@ -0,0 +1,14 @@
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN:   %s 2>&1 | FileCheck %s
+
+@tls = hidden thread_local addrspace(1) global i32 0, align 4
+
+; CHECK: error: {{.*}} in function direct_use void (): Accelerator does not support the thread_local variable tls
+define amdgpu_kernel void @direct_use() {
+entry:
+  %0 = call align 4 ptr addrspace(1) @llvm.threadlocal.address.p1(ptr addrspace(1) @tls)
+  %1 = load i32, ptr addrspace(1) %0, align 4
+  ret void
+}
+
+declare nonnull ptr addrspace(1) @llvm.threadlocal.address.p1(ptr addrspace(1) nonnull)
diff --git a/llvm/test/Transforms/HipStdPar/unsupported-thread-local-indirect-use.ll b/llvm/test/Transforms/HipStdPar/unsupported-thread-local-indirect-use.ll
new file mode 100644
index 000000000000000..40014853d8ac526
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/unsupported-thread-local-indirect-use.ll
@@ -0,0 +1,14 @@
+; RUN: not opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN:   %s 2>&1 | FileCheck %s
+
+@tls = hidden thread_local addrspace(1) global i32 0, align 4
+
+; CHECK: error: {{.*}} in function indirect_use void (): Accelerator does not support the thread_local variable tls
+define amdgpu_kernel void @indirect_use() {
+entry:
+  %0 = call align 4 ptr @llvm.threadlocal.address.p0(ptr addrspacecast (ptr addrspace(1) @tls to ptr))
+  %1 = load i32, ptr %0, align 4
+  ret void
+}
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)

>From 6cf41ada44c812cbd58f5907f15df8a8ce1f3a74 Mon Sep 17 00:00:00 2001
From: Yusra Syeda <99052248+ysyeda at users.noreply.github.com>
Date: Thu, 12 Oct 2023 06:42:55 -0400
Subject: [PATCH 05/10] [SystemZ][z/OS] Add vararg support to z/OS (#68834)

This PR adds vararg support to z/OS and updates the call-zos-vararg.ll
lit test.

Co-authored-by: Yusra Syeda <yusra.syeda at ibm.com>
---
 .../Target/SystemZ/SystemZFrameLowering.cpp   |  24 ++
 .../Target/SystemZ/SystemZISelLowering.cpp    |  18 +-
 llvm/test/CodeGen/SystemZ/call-zos-vararg.ll  | 267 ++++++++++++------
 3 files changed, 228 insertions(+), 81 deletions(-)
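
Before the diffs, a rough source-level illustration of what the updated lit test exercises (a sketch of my own, not taken from this patch): a variadic XPLINK64 callee must be able to walk its unnamed arguments in memory, which is why the prologue change below spills the remaining argument GPRs to the caller-provided argument area.

  // Roughly the shape of @pass_vararg in the updated test: the named argument
  // arrives in r1, the first unnamed one in r2, and va_arg then reads it back
  // from the spill slot written in the prologue.
  #include <cstdarg>

  extern "C" long pass_vararg(long x, ...) {
    va_list va;
    va_start(va, x);
    long first_unnamed = va_arg(va, long);
    va_end(va);
    return first_unnamed + x;
  }
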

diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 75a37060a8d06af..bfd31709eb3e0bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -1275,6 +1275,30 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
     for (MachineBasicBlock &B : llvm::drop_begin(MF))
       B.addLiveIn(Regs.getFramePointerRegister());
   }
+
+  // Save GPRs used for varargs, if any.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  bool IsVarArg = MF.getFunction().isVarArg();
+
+  if (IsVarArg) {
+    // FixedRegs is the number of used registers, accounting for shadow
+    // registers.
+    unsigned FixedRegs = ZFI->getVarArgsFirstGPR() + ZFI->getVarArgsFirstFPR();
+    auto &GPRs = SystemZ::XPLINK64ArgGPRs;
+    for (unsigned I = FixedRegs; I < SystemZ::XPLINK64NumArgGPRs; I++) {
+      uint64_t StartOffset = MFFrame.getOffsetAdjustment() +
+                             MFFrame.getStackSize() + Regs.getCallFrameSize() +
+                             getOffsetOfLocalArea() + I * 8;
+      unsigned Reg = GPRs[I];
+      BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STG))
+          .addReg(Reg)
+          .addReg(Regs.getStackPointerRegister())
+          .addImm(StartOffset)
+          .addReg(0);
+      if (!MBB.isLiveIn(Reg))
+        MBB.addLiveIn(Reg);
+    }
+  }
 }
 
 void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF,
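
A quick sanity check of the StartOffset arithmetic above against the CHECK lines added to the test further down. The 2048-byte XPLINK local-area offset and the 128-byte call frame size are my assumptions (and OffsetAdjustment is taken as zero); with them the numbers line up with the expected spill slots:

  // For @pass_vararg in the updated test: stack size 160, one named argument
  // in r1, so the unnamed-register spills start at GPR index 1.
  #include <cstdint>

  constexpr uint64_t spillOffset(uint64_t StackSize, uint64_t CallFrameSize,
                                 uint64_t LocalAreaOffset, unsigned I) {
    return StackSize + CallFrameSize + LocalAreaOffset + I * 8;
  }

  static_assert(spillOffset(160, 128, 2048, 1) == 2344, "stg 2, 2344(4)");
  static_assert(spillOffset(160, 128, 2048, 2) == 2352, "stg 3, 2352(4)");
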
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5c5cb964bd28db8..f88bd9b45aee601 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1613,7 +1613,23 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
       InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
   }
 
-  // FIXME: Add support for lowering varargs for XPLINK64 in a later patch.
+  if (IsVarArg && Subtarget.isTargetXPLINK64()) {
+    // Save the number of non-varargs registers for later use by va_start, etc.
+    FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
+    FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
+
+    auto *Regs = static_cast<SystemZXPLINK64Registers *>(
+        Subtarget.getSpecialRegisters());
+
+    // Likewise the address (in the form of a frame index) of where the
+    // first stack vararg would be.  The 1-byte size here is arbitrary.
+    // FIXME: Pre-include call frame size in the offset, should not
+    // need to manually add it here.
+    int64_t VarArgOffset = CCInfo.getStackSize() + Regs->getCallFrameSize();
+    int FI = MFI.CreateFixedObject(1, VarArgOffset, true);
+    FuncInfo->setVarArgsFrameIndex(FI);
+  }
+
   if (IsVarArg && Subtarget.isTargetELF()) {
     // Save the number of non-varargs registers for later use by va_start, etc.
     FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
diff --git a/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
index ac157e5fa67f62c..bde59a6be782277 100644
--- a/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
+++ b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
@@ -1,88 +1,149 @@
 ; Test passing variable argument lists in 64-bit calls on z/OS.
 ; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z10 | FileCheck %s
 ; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z14 | FileCheck %s -check-prefix=ARCH12
-; CHECK-LABEL: call_vararg_double0
-; CHECK:       llihf 3, 1074118262
-; CHECK-NEXT:  oilf  3, 3367254360
-; CHECK:       lghi  1, 1
-; CHECK:       lghi  2, 2
+; CHECK-LABEL: call_vararg_double0:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 8(5)
+; CHECK-NEXT:    lg 5, 0(5)
+; CHECK-NEXT:    llihf 3, 1074118262
+; CHECK-NEXT:    oilf 3, 3367254360
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    lghi 2, 2
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_double0() {
 entry:
   %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, double 2.718000e+00)
   ret i64 %retval
 }
 
-; CHECK-LABEL:  call_vararg_double1
-; CHECK:        llihf 0, 1074118262
-; CHECK-NEXT:   oilf  0, 3367254360
-; CHECK:        llihf 3, 1074340036
-; CHECK-NEXT:   oilf  3, 2611340116
-; CHECK:        lghi  1, 1
-; CHECK:        lghi  2, 2
-; CHECK:        stg 0, 2200(4)
+; CHECK-LABEL: call_vararg_double1:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    llihf 0, 1074118262
+; CHECK-NEXT:    oilf 0, 3367254360
+; CHECK-NEXT:    lg 6, 8(5)
+; CHECK-NEXT:    lg 5, 0(5)
+; CHECK-NEXT:    llihf 3, 1074340036
+; CHECK-NEXT:    oilf 3, 2611340116
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    lghi 2, 2
+; CHECK-NEXT:    stg 0, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_double1() {
 entry:
   %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, double 3.141000e+00, double 2.718000e+00)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_double2
-; CHECK-NOT:   llihf 0
-; CHECK-NOT:   oilf 0
-; CHECK:       llihf 2, 1074118262
-; CHECK-NEXT:  oilf  2, 3367254360
-; CHECK:       lghi  1, 8200
+; CHECK-LABEL: call_vararg_double2:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 24(5)
+; CHECK-NEXT:    lg 5, 16(5)
+; CHECK-NEXT:    llihf 2, 1074118262
+; CHECK-NEXT:    oilf 2, 3367254360
+; CHECK-NEXT:    lghi 1, 8200
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_double2() {
 entry:
   %retval = call i64 (i64, ...) @pass_vararg2(i64 8200, double 2.718000e+00)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_double3
-; CHECK:       llihf   0, 1072703839
-; CHECK-NEXT:  oilf    0, 2861204133
-; CHECK:       llihf   1, 1074118262
-; CHECK-NEXT:  oilf    1, 3367254360
-; CHECK:       llihf   2, 1074340036
-; CHECK-NEXT:  oilf    2, 2611340116
-; CHECK:       llihf   3, 1073127358
-; CHECK-NEXT:  oilf    3, 1992864825
-; CHECK:       stg     0, 2200(4)
+; CHECK-LABEL: call_vararg_double3:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    llihf 0, 1072703839
+; CHECK-NEXT:    oilf 0, 2861204133
+; CHECK-NEXT:    lg 6, 40(5)
+; CHECK-NEXT:    lg 5, 32(5)
+; CHECK-NEXT:    llihf 1, 1074118262
+; CHECK-NEXT:    oilf 1, 3367254360
+; CHECK-NEXT:    llihf 2, 1074340036
+; CHECK-NEXT:    oilf 2, 2611340116
+; CHECK-NEXT:    llihf 3, 1073127358
+; CHECK-NEXT:    oilf 3, 1992864825
+; CHECK-NEXT:    stg 0, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_double3() {
 entry:
   %retval = call i64 (...) @pass_vararg3(double 2.718000e+00, double 3.141000e+00, double 1.414000e+00, double 1.010101e+00)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_both0
-; CHECK:       lgr   2, 1
-; CHECK:       lgdr  1, 0
+; CHECK-LABEL: call_vararg_both0:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 40(5)
+; CHECK-NEXT:    lg 5, 32(5)
+; CHECK-NEXT:    lgr 2, 1
+; CHECK-NEXT:    lgdr 1, 0
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_both0(i64 %arg0, double %arg1) {
   %retval  = call i64(...) @pass_vararg3(double %arg1, i64 %arg0)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_long_double0
-; CHECK:       larl  1, @CPI5_0
-; CHECK-NEXT:  ld    0, 0(1)
-; CHECK-NEXT:  ld    2, 8(1)
-; CHECK:       lgdr  3, 0
-; CHECK:       lghi  1, 1
-; CHECK:       lghi  2, 2
-; CHECK:       std   0, 2192(4)
-; CHECK-NEXT:  std   2, 2200(4)
+; CHECK-LABEL: call_vararg_long_double0:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    larl 1, @CPI5_0
+; CHECK-NEXT:    ld 0, 0(1)
+; CHECK-NEXT:    ld 2, 8(1)
+; CHECK-NEXT:    lg 6, 8(5)
+; CHECK-NEXT:    lg 5, 0(5)
+; CHECK-NEXT:    lgdr 3, 0
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    lghi 2, 2
+; CHECK-NEXT:    std 0, 2192(4)
+; CHECK-NEXT:    std 2, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_long_double0() {
 entry:
   %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 0xLE0FC1518450562CD4000921FB5444261)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_long_double1
-; CHECK:       lgdr  3, 0
-; CHECK:       lghi  1, 1
-; CHECK:       lghi  2, 2
-; CHECK:       std   0, 2192(4)
-; CHECK-NEXT:  std   2, 2200(4)
+; CHECK-LABEL: call_vararg_long_double1:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 8(5)
+; CHECK-NEXT:    lg 5, 0(5)
+; CHECK-NEXT:    lgdr 3, 0
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    lghi 2, 2
+; CHECK-NEXT:    std 0, 2192(4)
+; CHECK-NEXT:    std 2, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_long_double1(fp128 %arg0) {
 entry:
   %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 %arg0)
@@ -90,22 +151,41 @@ entry:
 }
 
 ; CHECK-LABEL: call_vararg_long_double2
-; CHECK:      std   4, 2208(4)
-; CHECK-NEXT: std   6, 2216(4)
-; CHECK:      lgdr  3, 0
-; CHECK:      lghi  1, 1
-; CHECK:      lghi  2, 2
-; CHECK:      std   0, 2192(4)
-; CHECK-NEXT: std   2, 2200(4)
+; CHECK-LABEL: call_vararg_long_double2:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    std 4, 2208(4)
+; CHECK-NEXT:    std 6, 2216(4)
+; CHECK-NEXT:    lg 6, 8(5)
+; CHECK-NEXT:    lg 5, 0(5)
+; CHECK-NEXT:    lgdr 3, 0
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    lghi 2, 2
+; CHECK-NEXT:    std 0, 2192(4)
+; CHECK-NEXT:    std 2, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_long_double2(fp128 %arg0, fp128 %arg1) {
 entry:
   %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 %arg0, fp128 %arg1)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_long_double3
-; CHECK:       lgdr 3, 2
-; CHECK-NEXT:  lgdr 2, 0
+; CHECK-LABEL: call_vararg_long_double3:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 40(5)
+; CHECK-NEXT:    lg 5, 32(5)
+; CHECK-NEXT:    lgdr 3, 2
+; CHECK-NEXT:    lgdr 2, 0
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_long_double3(fp128 %arg0) {
 entry:
   %retval = call i64 (...) @pass_vararg3(fp128 %arg0)
@@ -173,38 +253,58 @@ define void @call_vec_double_vararg_straddle(<2 x double> %v) {
   ret void
 }
 
-; CHECK-LABEL: call_vararg_integral0
-; Since arguments 0, 1, and 2 are already in the correct
-; registers, we should have no loads of any sort into
-; GPRs 1, 2, and 3.
-; CHECK-NOT: lg  1
-; CHECK-NOT: lgr  1
-; CHECK-NOT: lg  2
-; CHECK-NOT: lgr  2
-; CHECK-NOT: lg  3
-; CHECK-NOT: lgr  3
+; CHECK-LABEL: call_vararg_integral0:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 0, 2392(4)
+; CHECK-NEXT:    lg 6, 40(5)
+; CHECK-NEXT:    lg 5, 32(5)
+; CHECK-NEXT:    stg 0, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_integral0(i32 signext %arg0, i16 signext %arg1, i64 signext %arg2, i8 signext %arg3) {
 entry:
   %retval = call i64(...) @pass_vararg3(i32 signext %arg0, i16 signext %arg1, i64 signext %arg2, i8 signext %arg3)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_float0
-; CHECK:       lghi  1, 1
-; CHECK:       llihf 2, 1073692672
+; CHECK-LABEL: call_vararg_float0:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 24(5)
+; CHECK-NEXT:    lg 5, 16(5)
+; CHECK-NEXT:    lghi 1, 1
+; CHECK-NEXT:    llihf 2, 1073692672
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_float0() {
 entry:
   %retval = call i64 (i64, ...) @pass_vararg2(i64 1, float 1.953125)
   ret i64 %retval
 }
 
-; CHECK-LABEL: call_vararg_float1
-; CHECK:       larl  1, @CPI17_0
-; CHECK:       le  0, 0(1)
-; CHECK:       llihf 0, 1073692672
-; CHECK:       llihh 2, 16384
-; CHECK:       llihh 3, 16392
-; CHECK:       stg  0, 2200(4)
+; CHECK-LABEL: call_vararg_float1:
+; CHECK:         stmg 6, 7, 1872(4)
+; CHECK-NEXT:    aghi 4, -192
+; CHECK-NEXT:    lg 6, 72(5)
+; CHECK-NEXT:    lg 5, 64(5)
+; CHECK-NEXT:    larl 1, @CPI17_0
+; CHECK-NEXT:    le 0, 0(1)
+; CHECK-NEXT:    llihf 0, 1073692672
+; CHECK-NEXT:    llihh 2, 16384
+; CHECK-NEXT:    llihh 3, 16392
+; CHECK-NEXT:    stg 0, 2200(4)
+; CHECK-NEXT:    basr 7, 6
+; CHECK-NEXT:    bcr 0, 0
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 192
+; CHECK-NEXT:    b 2(7)
 define i64 @call_vararg_float1() {
 entry:
   %retval = call i64 (float, ...) @pass_vararg4(float 1.0, float 2.0, float 3.0, float 1.953125)
@@ -224,9 +324,16 @@ entry:
 ; }
 ;
 ; CHECK-LABEL: pass_vararg:
-; CHECK: aghi    4, -160
-; CHECK: la      0, 2208(4)
-; CHECK: stg     0, 2200(4)
+; CHECK:         stmg 6, 7, 1904(4)
+; CHECK-NEXT:    aghi 4, -160
+; CHECK-NEXT:    stg 2, 2344(4)
+; CHECK-NEXT:    stg 3, 2352(4)
+; CHECK-NEXT:    la 0, 2352(4)
+; CHECK-NEXT:    stg 0, 2200(4)
+; CHECK-NEXT:    lg 3, 2344(4)
+; CHECK-NEXT:    lg 7, 2072(4)
+; CHECK-NEXT:    aghi 4, 160
+; CHECK-NEXT:    b 2(7)
 define hidden i64 @pass_vararg(i64 %x, ...) {
 entry:
   %va = alloca ptr, align 8

>From bf0534e8702a10c9085720711fde0cbbc532ac87 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek at codeweavers.com>
Date: Thu, 12 Oct 2023 12:49:16 +0200
Subject: [PATCH 06/10] [llvm-lib] [Object] Use ECSYMBOLS section for ARM64EC
 importlib symbols. (#68328)

---
 llvm/include/llvm/Object/COFFImportFile.h    |  2 +
 llvm/lib/Object/ArchiveWriter.cpp            |  5 ++
 llvm/lib/Object/COFFImportFile.cpp           |  3 +-
 llvm/test/tools/llvm-lib/arm64ec-implib.test | 54 ++++++++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-lib/arm64ec-implib.test

diff --git a/llvm/include/llvm/Object/COFFImportFile.h b/llvm/include/llvm/Object/COFFImportFile.h
index 3d148112dcbb61f..0fb65fabdbcad59 100644
--- a/llvm/include/llvm/Object/COFFImportFile.h
+++ b/llvm/include/llvm/Object/COFFImportFile.h
@@ -63,6 +63,8 @@ class COFFImportFile : public SymbolicFile {
         Data.getBufferStart());
   }
 
+  uint16_t getMachine() const { return getCOFFImportHeader()->Machine; }
+
 private:
   bool isData() const {
     return getCOFFImportHeader()->getType() == COFF::IMPORT_DATA;
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index aa74a2f74ff576c..a67c657b48ba071 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/COFFImportFile.h"
 #include "llvm/Object/Error.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Object/MachO.h"
@@ -659,6 +660,10 @@ static bool isECObject(object::SymbolicFile &Obj) {
     return cast<llvm::object::COFFObjectFile>(&Obj)->getMachine() !=
            COFF::IMAGE_FILE_MACHINE_ARM64;
 
+  if (Obj.isCOFFImportFile())
+    return cast<llvm::object::COFFImportFile>(&Obj)->getMachine() !=
+           COFF::IMAGE_FILE_MACHINE_ARM64;
+
   if (Obj.isIR()) {
     Expected<std::string> TripleStr =
         getBitcodeTargetTriple(Obj.getMemoryBufferRef());
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index a567ecde9b7afc5..2cca1f728cc713b 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -612,7 +612,8 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
 
   return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab,
                       MinGW ? object::Archive::K_GNU : object::Archive::K_COFF,
-                      /*Deterministic*/ true, /*Thin*/ false);
+                      /*Deterministic*/ true, /*Thin*/ false,
+                      /*OldArchiveBuf*/ nullptr, isArm64EC(Machine));
 }
 
 } // namespace object
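
A hypothetical check in the spirit of the isECObject() change above (illustration only; the machine values are the PE/COFF constants, but the helper and its name are mine): import-library members whose machine is anything other than plain ARM64 contribute to the EC symbol map.

  #include <cstdint>

  constexpr uint16_t IMAGE_FILE_MACHINE_ARM64   = 0xAA64;
  constexpr uint16_t IMAGE_FILE_MACHINE_ARM64EC = 0xA641;

  constexpr bool isECImportMember(uint16_t Machine) {
    return Machine != IMAGE_FILE_MACHINE_ARM64;
  }

  static_assert(isECImportMember(IMAGE_FILE_MACHINE_ARM64EC), "EC member");
  static_assert(!isECImportMember(IMAGE_FILE_MACHINE_ARM64), "native ARM64 member");
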
diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test
new file mode 100644
index 000000000000000..ee8b134d06b1667
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test
@@ -0,0 +1,54 @@
+Test creating ARM64EC importlib.
+
+RUN: split-file %s %t.dir && cd %t.dir
+RUN: llvm-lib -machine:arm64ec -def:test.def -out:test.lib
+
+RUN: llvm-nm --print-armap test.lib | FileCheck -check-prefix=ARMAP %s
+
+ARMAP:      Archive EC map
+ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
+ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll
+ARMAP-NEXT: __imp_dataexp in test.dll
+ARMAP-NEXT: __imp_funcexp in test.dll
+ARMAP-NEXT: funcexp in test.dll
+ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll
+
+RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s
+
+READOBJ:      File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-ARM64EC
+READOBJ-NEXT: Arch: aarch64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-ARM64EC
+READOBJ-NEXT: Arch: aarch64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: Format: COFF-ARM64
+READOBJ-NEXT: Arch: aarch64
+READOBJ-NEXT: AddressSize: 64bit
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: name
+READOBJ-NEXT: Symbol: __imp_funcexp
+READOBJ-NEXT: Symbol: funcexp
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file
+READOBJ-NEXT: Type: data
+READOBJ-NEXT: Name type: name
+READOBJ-NEXT: Symbol: __imp_dataexp
+
+Creating a new lib containing the existing lib:
+RUN: llvm-lib -machine:arm64ec test.lib -out:test2.lib
+RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s
+
+#--- test.def
+LIBRARY test.dll
+EXPORTS
+    funcexp
+    dataexp DATA

>From 22b6b8d7b53105bb7fbcbb94c7dd77ce028a8c23 Mon Sep 17 00:00:00 2001
From: Timm Bäder <tbaeder at redhat.com>
Date: Thu, 12 Oct 2023 07:09:12 +0200
Subject: [PATCH 07/10] [clang][Interp][NFC] Remove Pointer.h include from
 Function.h

---
 clang/lib/AST/Interp/Function.h      | 3 ++-
 clang/lib/AST/Interp/InterpStack.cpp | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/Function.h b/clang/lib/AST/Interp/Function.h
index 0bae314e97701d9..b93477c56346a9d 100644
--- a/clang/lib/AST/Interp/Function.h
+++ b/clang/lib/AST/Interp/Function.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_CLANG_AST_INTERP_FUNCTION_H
 #define LLVM_CLANG_AST_INTERP_FUNCTION_H
 
-#include "Pointer.h"
 #include "Source.h"
+#include "Descriptor.h"
 #include "clang/AST/ASTLambda.h"
 #include "clang/AST/Decl.h"
 #include "llvm/Support/raw_ostream.h"
@@ -25,6 +25,7 @@ namespace clang {
 namespace interp {
 class Program;
 class ByteCodeEmitter;
+class Pointer;
 enum PrimType : uint32_t;
 
 /// Describes a scope block.
diff --git a/clang/lib/AST/Interp/InterpStack.cpp b/clang/lib/AST/Interp/InterpStack.cpp
index da4b36f8c1bf351..18a34079c3b16ae 100644
--- a/clang/lib/AST/Interp/InterpStack.cpp
+++ b/clang/lib/AST/Interp/InterpStack.cpp
@@ -10,6 +10,7 @@
 #include "Boolean.h"
 #include "Floating.h"
 #include "Integral.h"
+#include "Pointer.h"
 #include <cassert>
 #include <cstdlib>
 

>From 88e9ea98cd49c20b188466bcf518218940405b11 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 4 Oct 2023 13:36:25 -0700
Subject: [PATCH 08/10] [AMDGPU] Change the representation of double literals
 in operands

A 64-bit literal can be used as a 32-bit zero or sign extended
operand. In case of double zeroes are added to the low 32 bits.
Currently asm parser stores only high 32 bits of a double into
an operand. To support codegen as requested by the
https://github.com/llvm/llvm-project/issues/67781 we need to
change the representation to store a full 64-bit value so that
codegen can simply add immediates to an instruction.

There is some code to support compatibility with existing tests
and asm kernels. We allow to use short hex strings to represent
only a high 32 bit of a double value as a valid literal.
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 21 ++++++++++++--
 .../Disassembler/AMDGPUDisassembler.cpp       | 28 ++++++++++++++-----
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  9 ++++--
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 12 +++++---
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 +-
 .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp      |  5 +++-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  4 ++-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  7 +++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++
 9 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 1e07e8deb560fcb..253a2e98f0cb685 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2141,9 +2141,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
           const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
           "Can't encode literal as exact 64-bit floating-point operand. "
           "Low 32-bits will be set to zero");
+          Val &= 0xffffffff00000000u;
         }
 
-        Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+        Inst.addOperand(MCOperand::createImm(Val));
         setImmKindLiteral();
         return;
       }
@@ -2242,7 +2243,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
       return;
     }
 
-    Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+    if (isInt<32>(Val) || isUInt<32>(Val))
+      Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? Val << 32 : Lo_32(Val);
+
+    Inst.addOperand(MCOperand::createImm(Val));
     setImmKindLiteral();
     return;
 
@@ -4309,7 +4313,18 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
       continue;
 
     if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
-      uint32_t Value = static_cast<uint32_t>(MO.getImm());
+      uint64_t Value = static_cast<uint64_t>(MO.getImm());
+      bool IsFP = AMDGPU::isSISrcFPOperand(Desc, OpIdx);
+      bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP);
+
+      if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) {
+        Error(getLitLoc(Operands), "invalid operand for instruction");
+        return false;
+      }
+
+      if (IsFP && IsValid32Op)
+        Value = Hi_32(Value);
+
       if (NumLiterals == 0 || LiteralValue != Value) {
         LiteralValue = Value;
         ++NumLiterals;
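
A standalone restatement of the literal normalization the parser hunk above now performs (using plain integer helpers rather than the LLVM ones, so this is an illustration of the intended behaviour, not the actual API): a value that fits in 32 bits is shifted into the high half when it feeds a 64-bit floating-point operand, and truncated to its low 32 bits otherwise.

  #include <cassert>
  #include <cstdint>

  uint64_t normalizeLiteral(int64_t Val, bool IsFP64Operand) {
    bool FitsIn32 = (Val >= INT32_MIN && Val <= INT32_MAX) ||   // isInt<32>
                    (Val >= 0 && uint64_t(Val) <= UINT32_MAX);  // isUInt<32>
    if (!FitsIn32)
      return uint64_t(Val);                        // keep the full 64-bit literal
    return IsFP64Operand ? uint64_t(Val) << 32     // high half of the double
                         : uint64_t(Val) & 0xffffffffu;
  }

  int main() {
    // 0x3ff00000 as an FP64 literal denotes 1.0 (bit pattern 0x3ff0000000000000).
    assert(normalizeLiteral(0x3ff00000, /*IsFP64Operand=*/true) ==
           0x3ff0000000000000ull);
    // The same value used as an integer operand keeps its low 32 bits.
    assert(normalizeLiteral(0x3ff00000, /*IsFP64Operand=*/false) == 0x3ff00000u);
    return 0;
  }
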
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index a504a5e86760bd6..83d973dc62e7770 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -378,6 +378,15 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
 }
 
+static DecodeStatus
+decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr,
+                       const MCDisassembler *Decoder) {
+  assert(Imm < (1 << 9) && "9-bit encoding");
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm,
+                                            false, 64, true));
+}
+
 static DecodeStatus
 DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
@@ -1219,7 +1228,7 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
   return MCOperand::createImm(Literal);
 }
 
-MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
@@ -1229,9 +1238,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
                         Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal = eatBytes<uint32_t>(Bytes);
+    Literal = Literal64 = eatBytes<uint32_t>(Bytes);
+    if (ExtendFP64)
+      Literal64 <<= 32;
   }
-  return MCOperand::createImm(Literal);
+  return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
 }
 
 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1448,7 +1459,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
 
 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                           bool MandatoryLiteral,
-                                          unsigned ImmWidth) const {
+                                          unsigned ImmWidth,
+                                          bool IsFP) const {
   using namespace AMDGPU::EncValues;
 
   assert(Val < 1024); // enum10
@@ -1460,13 +1472,15 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
     return createRegOperand(IsAGPR ? getAgprClassId(Width)
                                    : getVgprClassId(Width), Val - VGPR_MIN);
   }
-  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth);
+  return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
+                            IsFP);
 }
 
 MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
                                                  unsigned Val,
                                                  bool MandatoryLiteral,
-                                                 unsigned ImmWidth) const {
+                                                 unsigned ImmWidth,
+                                                 bool IsFP) const {
   // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
   // decoded earlier.
   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
@@ -1494,7 +1508,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
       // Keep a sentinel value for deferred setting
       return MCOperand::createImm(LITERAL_CONST);
     else
-      return decodeLiteralConstant();
+      return decodeLiteralConstant(IsFP && ImmWidth == 64);
   }
 
   switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5f3b277d577ff7c..865db2b26307b43 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -97,6 +97,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   const unsigned TargetMaxInstBytes;
   mutable ArrayRef<uint8_t> Bytes;
   mutable uint32_t Literal;
+  mutable uint64_t Literal64;
   mutable bool HasLiteral;
   mutable std::optional<bool> EnableWavefrontSize32;
 
@@ -229,15 +230,17 @@ class AMDGPUDisassembler : public MCDisassembler {
   static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm);
 
   MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
-  MCOperand decodeLiteralConstant() const;
+  MCOperand decodeLiteralConstant(bool ExtendFP64) const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
                         bool MandatoryLiteral = false,
-                        unsigned ImmWidth = 0) const;
+                        unsigned ImmWidth = 0,
+                        bool IsFP = false) const;
 
   MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
                                bool MandatoryLiteral = false,
-                               unsigned ImmWidth = 0) const;
+                               unsigned ImmWidth = 0,
+                               bool IsFP = false) const;
 
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad4c48a8d65581a..40e92f00a9e52a6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -426,7 +426,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
 
 void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
                                          const MCSubtargetInfo &STI,
-                                         raw_ostream &O) {
+                                         raw_ostream &O, bool IsFP) {
   int64_t SImm = static_cast<int64_t>(Imm);
   if (SImm >= -16 && SImm <= 64) {
     O << SImm;
@@ -454,6 +454,8 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
   else if (Imm == 0x3fc45f306dc9c882 &&
            STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
     O << "0.15915494309189532";
+  else if (IsFP && AMDGPU::isValid32BitLiteral(Imm, true))
+    O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
   else {
     assert(isUInt<32>(Imm) || isInt<32>(Imm));
 
@@ -605,11 +607,13 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       printImmediate32(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+      printImmediate64(Op.getImm(), STI, O, false);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-      printImmediate64(Op.getImm(), STI, O);
+      printImmediate64(Op.getImm(), STI, O, true);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
@@ -671,7 +675,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       if (RCBits == 32)
         printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
       else if (RCBits == 64)
-        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O);
+        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
       else
         llvm_unreachable("Invalid register class size");
     }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3b14faab136b35a..dc83547a4afe049 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -91,7 +91,7 @@ class AMDGPUInstPrinter : public MCInstPrinter {
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
   void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
+                        raw_ostream &O, bool IsFP);
   void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
   void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 21243f80e055499..d93f747bf6f0a64 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -411,7 +411,10 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
     } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
       llvm_unreachable("Must be immediate or expr");
 
-    support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little);
+    if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64)
+      Imm = Hi_32(Imm);
+
+    support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little);
 
     // Only one literal value allowed
     break;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c3c5bfae405aa45..ea06e85fb400c1b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1263,7 +1263,9 @@ def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">;
-def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrc_f64";
+}
 def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
 def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d0ad763d9e6cc1..e7907b28abedf9d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2519,6 +2519,13 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   return Lo16 == Hi16;
 }
 
+bool isValid32BitLiteral(uint64_t Val, bool IsFP) {
+  if (IsFP)
+    return !(Val & 0xffffffffu);
+
+  return isUInt<32>(Val) || isInt<32>(Val);
+}
+
 bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 297a69f54d63721..fbe9adfd74fa9c6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1290,6 +1290,9 @@ bool isInlinableIntLiteralV216(int32_t Literal);
 LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isValid32BitLiteral(uint64_t Val, bool IsFP);
+
 bool isArgPassedInSGPR(const Argument *Arg);
 
 bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);

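A quick illustration of the rule patch 01 introduces: a non-inlinable f64 literal fits the 32-bit literal slot only when its low 32 bits are already zero (the hardware zero-extends them), while an integer literal merely has to fit in a signed or unsigned 32-bit value. The standalone C++ sketch below mirrors the isValid32BitLiteral predicate and the Hi_32 extraction from the diff; it is illustrative only (the sample doubles 123.0 and 0.1 are arbitrary), not the in-tree code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Mirror of the predicate added in AMDGPUBaseInfo.cpp: an f64 literal is
// representable as a 32-bit literal only if its low half is zero; an integer
// literal only has to fit in a signed or unsigned 32-bit value.
static bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
  if (IsFP64)
    return (Val & 0xffffffffu) == 0;
  int64_t S = static_cast<int64_t>(Val);
  return Val <= UINT32_MAX || (S >= INT32_MIN && S <= INT32_MAX);
}

// High 32 bits of a 64-bit value, as used when emitting the literal dword.
static uint32_t Hi_32(uint64_t Val) { return static_cast<uint32_t>(Val >> 32); }

int main() {
  uint64_t Bits;

  double A = 123.0; // bit pattern 0x405EC00000000000, low half is zero
  std::memcpy(&Bits, &A, sizeof(Bits));
  std::printf("123.0: valid 32-bit f64 literal = %d, emitted dword = 0x%08x\n",
              isValid32BitLiteral(Bits, /*IsFP64=*/true),
              (unsigned)Hi_32(Bits));

  double B = 0.1; // bit pattern 0x3FB999999999999A, low half is non-zero
  std::memcpy(&Bits, &B, sizeof(Bits));
  std::printf("0.1:   valid 32-bit f64 literal = %d\n",
              isValid32BitLiteral(Bits, /*IsFP64=*/true));
  return 0;
}

As far as these hunks show, an operand such as 123.0 is now stored as the full 0x405EC00000000000 in the MCInst and only trimmed to its high dword at encoding time.
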
>From 0bb5d0690804e371c7d506f4159d1a26617b6f5c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 12 Oct 2023 01:16:56 -0700
Subject: [PATCH 09/10] [AMDGPU] Make clang-format happy with disasm cganges

---
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp      | 13 ++++++-------
 .../Target/AMDGPU/Disassembler/AMDGPUDisassembler.h |  6 ++----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 83d973dc62e7770..d74fd0b3a9ea74e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -378,13 +378,13 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
 }
 
-static DecodeStatus
-decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr,
-                       const MCDisassembler *Decoder) {
+static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
+                                           uint64_t Addr,
+                                           const MCDisassembler *Decoder) {
   assert(Imm < (1 << 9) && "9-bit encoding");
   auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
-  return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm,
-                                            false, 64, true));
+  return addOperand(
+      Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true));
 }
 
 static DecodeStatus
@@ -1459,8 +1459,7 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
 
 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                           bool MandatoryLiteral,
-                                          unsigned ImmWidth,
-                                          bool IsFP) const {
+                                          unsigned ImmWidth, bool IsFP) const {
   using namespace AMDGPU::EncValues;
 
   assert(Val < 1024); // enum10
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 865db2b26307b43..91b73b593d61617 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -233,14 +233,12 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeLiteralConstant(bool ExtendFP64) const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
-                        bool MandatoryLiteral = false,
-                        unsigned ImmWidth = 0,
+                        bool MandatoryLiteral = false, unsigned ImmWidth = 0,
                         bool IsFP = false) const;
 
   MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
                                bool MandatoryLiteral = false,
-                               unsigned ImmWidth = 0,
-                               bool IsFP = false) const;
+                               unsigned ImmWidth = 0, bool IsFP = false) const;
 
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;

>From e2e1efa0e821bbe88b8fb9505d37a6fda8f9209b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 12 Oct 2023 12:18:08 -0700
Subject: [PATCH 10/10] Change argument of AMDGPU::isValid32BitLiteral to
 IsFP64

---
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 7 ++++---
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp      | 4 ++--
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h        | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 253a2e98f0cb685..ba967c303d0924f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4314,15 +4314,16 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
 
     if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
       uint64_t Value = static_cast<uint64_t>(MO.getImm());
-      bool IsFP = AMDGPU::isSISrcFPOperand(Desc, OpIdx);
-      bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP);
+      bool IsFP64 = AMDGPU::isSISrcFPOperand(Desc, OpIdx) &&
+                    AMDGPU::getOperandSize(Desc.operands()[OpIdx]) == 8;
+      bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP64);
 
       if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) {
         Error(getLitLoc(Operands), "invalid operand for instruction");
         return false;
       }
 
-      if (IsFP && IsValid32Op)
+      if (IsFP64 && IsValid32Op)
         Value = Hi_32(Value);
 
       if (NumLiterals == 0 || LiteralValue != Value) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e7907b28abedf9d..d123b384a27d4cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2519,8 +2519,8 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   return Lo16 == Hi16;
 }
 
-bool isValid32BitLiteral(uint64_t Val, bool IsFP) {
-  if (IsFP)
+bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
+  if (IsFP64)
     return !(Val & 0xffffffffu);
 
   return isUInt<32>(Val) || isInt<32>(Val);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fbe9adfd74fa9c6..bb2964f592f66bf 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1291,7 +1291,7 @@ LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
-bool isValid32BitLiteral(uint64_t Val, bool IsFP);
+bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
 
 bool isArgPassedInSGPR(const Argument *Arg);
 

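Putting the pieces of this series together, the flow for a non-inlinable f64 literal is: the MCOperand carries the full 64-bit value, AMDGPUMCCodeEmitter writes only its high dword (Hi_32) as the trailing literal, and the disassembler's ExtendFP64 path shifts that dword back up by 32 bits. The standalone sketch below demonstrates that round trip on an arbitrary example value; the helper names emitFP64LiteralDword and extendFP64Literal are hypothetical, chosen for illustration, and this is not the in-tree MC API.

#include <cstdint>
#include <cstdio>

// Encoder side: an OPERAND_REG_IMM_FP64 literal is emitted as its high dword.
static uint32_t emitFP64LiteralDword(uint64_t OperandImm) {
  return static_cast<uint32_t>(OperandImm >> 32); // Hi_32
}

// Disassembler side: the 32-bit literal dword is widened back to 64 bits
// (the ExtendFP64 path of decodeLiteralConstant).
static uint64_t extendFP64Literal(uint32_t LiteralDword) {
  return static_cast<uint64_t>(LiteralDword) << 32;
}

int main() {
  uint64_t Imm = 0x405EC00000000000ull; // 123.0, low half already zero
  uint32_t Dword = emitFP64LiteralDword(Imm);
  uint64_t RoundTrip = extendFP64Literal(Dword);
  std::printf("imm      = 0x%016llx\nliteral  = 0x%08x\nrestored = 0x%016llx (%s)\n",
              (unsigned long long)Imm, (unsigned)Dword,
              (unsigned long long)RoundTrip,
              RoundTrip == Imm ? "lossless" : "low bits dropped");
  return 0;
}

The round trip is lossless exactly when the low half of the original value is zero, i.e. when isValid32BitLiteral(Imm, /*IsFP64=*/true) from the earlier hunk returns true.
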

