[clang] [llvm] [X86] Remove AMX-TRANSPOSE (PR #165556)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Oct 29 06:32:17 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-driver
@llvm/pr-subscribers-clang
Author: Mikołaj Piróg (mikolaj-pirog)
<details>
<summary>Changes</summary>
Per Intel Architecture Instruction Set Extensions Programming Reference rev. 59 (https://cdrdv2.intel.com/v1/dl/getContent/671368), Revision History entry for revision -59, AMX-TRANSPOSE was removed
---
Patch is 526.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165556.diff
81 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86_64.td (-89)
- (modified) clang/include/clang/Driver/Options.td (-2)
- (modified) clang/lib/Basic/Targets/X86.cpp (-6)
- (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (-68)
- (modified) clang/lib/Headers/CMakeLists.txt (-6)
- (removed) clang/lib/Headers/amxbf16transposeintrin.h (-94)
- (removed) clang/lib/Headers/amxcomplextransposeintrin.h (-303)
- (removed) clang/lib/Headers/amxfp16transposeintrin.h (-94)
- (modified) clang/lib/Headers/amxintrin.h (-2)
- (removed) clang/lib/Headers/amxmovrstransposeintrin.h (-200)
- (removed) clang/lib/Headers/amxtf32transposeintrin.h (-105)
- (removed) clang/lib/Headers/amxtransposeintrin.h (-248)
- (modified) clang/lib/Headers/immintrin.h (-12)
- (modified) clang/lib/Sema/SemaX86.cpp (-17)
- (removed) clang/test/CodeGen/X86/amx_movrs_tranpose.c (-53)
- (removed) clang/test/CodeGen/X86/amx_movrs_tranpose_api.c (-81)
- (removed) clang/test/CodeGen/X86/amx_movrs_transpose_errors.c (-22)
- (modified) clang/test/CodeGen/X86/amx_tf32.c (-5)
- (modified) clang/test/CodeGen/X86/amx_tf32_api.c (-7)
- (modified) clang/test/CodeGen/X86/amx_tf32_errors.c (-8)
- (removed) clang/test/CodeGen/X86/amx_transpose.c (-75)
- (removed) clang/test/CodeGen/X86/amx_transpose_api.c (-114)
- (removed) clang/test/CodeGen/X86/amx_transpose_errors.c (-75)
- (modified) clang/test/Driver/x86-target-features.c (-7)
- (modified) clang/test/Preprocessor/predefined-arch-macros.c (-2)
- (modified) clang/test/Preprocessor/x86_target_features.c (-12)
- (modified) llvm/include/llvm/CodeGen/TileShapeInfo.h (+7-81)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (-104)
- (modified) llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h (-1)
- (modified) llvm/include/llvm/TargetParser/X86TargetParser.def (-1)
- (modified) llvm/lib/Target/X86/AsmParser/X86Operand.h (-31)
- (modified) llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp (-5)
- (modified) llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h (-7)
- (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp (-19)
- (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h (-1)
- (modified) llvm/lib/Target/X86/X86.td (+1-5)
- (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (-167)
- (modified) llvm/lib/Target/X86/X86FastPreTileConfig.cpp (+8-24)
- (modified) llvm/lib/Target/X86/X86FastTileConfig.cpp (-6)
- (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+2-76)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-154)
- (modified) llvm/lib/Target/X86/X86InstrAMX.td (-208)
- (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+3-12)
- (modified) llvm/lib/Target/X86/X86InstrOperands.td (-7)
- (modified) llvm/lib/Target/X86/X86InstrPredicates.td (-1)
- (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+26-177)
- (modified) llvm/lib/Target/X86/X86PreTileConfig.cpp (-9)
- (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+4-66)
- (modified) llvm/lib/Target/X86/X86RegisterInfo.td (+2-11)
- (modified) llvm/lib/Target/X86/X86TileConfig.cpp (+12-71)
- (modified) llvm/lib/TargetParser/Host.cpp (-1)
- (modified) llvm/lib/TargetParser/X86TargetParser.cpp (+1-1)
- (modified) llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt (-22)
- (modified) llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt (-22)
- (modified) llvm/test/CodeGen/X86/amx-tf32-internal.ll (+2-5)
- (modified) llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll (+1-11)
- (removed) llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll (-122)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll (-136)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir (-165)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir (-153)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_copy.mir (-97)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll (-87)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll (-61)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir (-134)
- (removed) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir (-113)
- (removed) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (-371)
- (modified) llvm/test/CodeGen/X86/ipra-reg-usage.ll (+2-2)
- (modified) llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt (-128)
- (modified) llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt (-8)
- (removed) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (-154)
- (modified) llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s (-128)
- (modified) llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s (-128)
- (modified) llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s (-7)
- (modified) llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s (-7)
- (removed) llvm/test/MC/X86/amx-transpose-att.s (-153)
- (removed) llvm/test/MC/X86/amx-transpose-intel.s (-153)
- (modified) llvm/test/TableGen/x86-instr-mapping.inc (-8)
- (modified) llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt (+26-26)
- (modified) llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt (+5711-5733)
- (modified) llvm/unittests/CodeGen/InstrRefLDVTest.cpp (+1-1)
- (modified) llvm/utils/TableGen/X86RecognizableInstr.cpp (-1)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
index 275278c5ac089..062060e6afbbe 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.td
+++ b/clang/include/clang/Basic/BuiltinsX86_64.td
@@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
}
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
- def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
@@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
}
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
- def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
let Features = "amx-fp8", Attributes = [NoThrow] in {
def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
@@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in {
def tilezero : X86Builtin<"void(unsigned char)">;
}
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-}
-
let Features = "amx-movrs", Attributes = [NoThrow] in {
def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
@@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
}
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
- def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
@@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
}
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
- def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
let Features = "prefetchi", Attributes = [NoThrow, Const] in {
def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
}
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 8784c9d7d206d..1d11db1209e47 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6695,8 +6695,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
-def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
-def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index e71f10c4c16fc..7a90c89dd7dc0 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXFP8 = true;
} else if (Feature == "+amx-movrs") {
HasAMXMOVRS = true;
- } else if (Feature == "+amx-transpose") {
- HasAMXTRANSPOSE = true;
} else if (Feature == "+amx-avx512") {
HasAMXAVX512 = true;
} else if (Feature == "+amx-tf32") {
@@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP8__");
if (HasAMXMOVRS)
Builder.defineMacro("__AMX_MOVRS__");
- if (HasAMXTRANSPOSE)
- Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasAMXAVX512)
Builder.defineMacro("__AMX_AVX512__");
if (HasAMXTF32)
@@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-movrs", true)
.Case("amx-tf32", true)
.Case("amx-tile", true)
- .Case("amx-transpose", true)
.Case("avx", true)
.Case("avx10.1", true)
.Case("avx10.2", true)
@@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-movrs", HasAMXMOVRS)
.Case("amx-tf32", HasAMXTF32)
.Case("amx-tile", HasAMXTILE)
- .Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
.Case("avx10.1", HasAVX10_1)
.Case("avx10.2", HasAVX10_2)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b924407b6ddd7..2381b2e7cf2cf 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// instruction, but it will create a memset that won't be optimized away.
return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
}
- // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
- case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
- Intrinsic::ID IID;
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
- IID = Intrinsic::x86_t2rpntlvwz0_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
- IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
- IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
- IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
- IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
- break;
- }
-
- // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
- Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
- {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
-
- auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
- assert(PtrTy && "arg3 must be of pointer type");
- QualType PtreeTy = PtrTy->getPointeeType();
- llvm::Type *TyPtee = ConvertType(PtreeTy);
-
- // Bitcast amx type (x86_amx) to vector type (256 x i32)
- // Then store tile0 into DstPtr0
- Value *T0 = Builder.CreateExtractValue(Call, 0);
- Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
- {TyPtee}, {T0});
- Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
-
- // Then store tile1 into DstPtr1
- Value *T1 = Builder.CreateExtractValue(Call, 1);
- Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
- {TyPtee}, {T1});
- Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
-
- // Note: Here we escape directly use x86_tilestored64_internal to store
- // the results due to it can't make sure the Mem written scope. This may
- // cause shapes reloads after first amx intrinsic, which current amx reg-
- // ister allocation has no ability to handle it.
-
- return Store;
- }
case X86::BI__ud2:
// llvm.trap makes a ud2a instruction on x86.
return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 18589125697b0..33fff7645df65 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -162,18 +162,12 @@ set(x86_files
adxintrin.h
ammintrin.h
amxavx512intrin.h
- amxbf16transposeintrin.h
amxcomplexintrin.h
- amxcomplextransposeintrin.h
amxfp16intrin.h
- amxfp16transposeintrin.h
amxfp8intrin.h
amxintrin.h
amxmovrsintrin.h
- amxmovrstransposeintrin.h
amxtf32intrin.h
- amxtf32transposeintrin.h
- amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
deleted file mode 100644
index 86f09f2ad8db2..0000000000000
--- a/clang/lib/Headers/amxbf16transposeintrin.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_BF16TRANSPOSEINTRIN_H
-#define __AMX_BF16TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-bf16,amx-transpose")))
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-/// tiles \a a and \a b, accumulating the intermediate single-precision
-/// (32-bit) floating-point elements with elements in \a dst, and store the
-/// 32-bit result back to tile \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-/// tmp := dst.row[m]
-/// FOR k := 0 TO (a.colsb / 4) - 1
-/// FOR n := 0 TO (dst.colsb / 4) - 1
-/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
-/// FP32(b.row[k].bf16[2*n+0])
-/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
-/// FP32(b.row[k].bf16[2*n+1])
-/// ENDFOR
-/// ENDFOR
-/// write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param a
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-/// tiles src0 and src1, accumulating the intermediate single-precision
-/// (32-bit) floating-point elements with elements in "dst", and store the
-/// 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __x86_64__ */
-#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
deleted file mode 100644
index 11abaf98e9371..0000000000000
--- a/clang/lib/Headers/amxcomplextransposeintrin.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
-#define __AMX_COMPLEXTRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-complex,amx-transpose")))
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-/// accumulate the results into a packed single precision tile. Each dword
-/// element in input tiles \a a and \a b is interpreted as a complex number
-/// with FP16 real part and FP16 imaginary part.
-/// Calculates the imaginary part of the result. For each possible combination
-/// of (transposed column of \a a, column of \a b), it performs a set of
-/// multiplication and accumulations on all corre...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/165556
More information about the cfe-commits
mailing list