[PATCH] D91927: [X86] Add x86_amx type for intel AMX.

Tue Nov 24 21:39:03 PST 2020

LuoYuanke marked an inline comment as done.
LuoYuanke added inline comments.

================
Comment at: llvm/lib/IR/DataLayout.cpp:819
+  case Type::X86_AMXTyID:
+    return Align(64);
   default:
----------------
pengfei wrote:
> Should be 512 bits?
Yes. It is 512 bits. Thanks.

================
Comment at: llvm/lib/Target/X86/X86LowerAMXType.cpp:72
   LLVMContext &Ctx = Builder.getContext();
-  Type *Ty = LD->getType();
-  EVT VT = EVT::getEVT(Ty);
-  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
-  Type *HalfTy = HalfVT.getTypeForEVT(Ctx);
-
-  Value *Ptr = LD->getPointerOperand();
-  PointerType *HalfPtrTy = HalfTy->getPointerTo(LD->getPointerAddressSpace());
-  Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
-  // The HW require the alignment for AMX tile is 64, but front-end generate
-  // code for the vector alignment which is the vector size.
-  uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
-  Align Alignment = std::min(LD->getAlign(), Align(HalfTySize));
-  auto *Lo =
-      Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
-
-  HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
-  auto *Hi =
-      Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
-
-  LoadMap[Inst] = std::make_pair(Lo, Hi);
-}
-
-bool X86LowerAMXType::visitLD() {
-  if (LDSet.empty())
-    return false;
-  for (auto &Inst : LDSet) {
-    int Count = 0;
-    Value *NewInst = nullptr;
-    // The user should be all AMX intrinsics or all LLVM instruction.
-    // Don't support it is used by both AMX intrinsics and LLVM instructions.
-    for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
-      Use &U = *I++;
-      const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U.getUser());
-      if (!II) {
-        Count++;
-        continue;
-      }
-      if (NewInst)
-        continue;
-      Value *Row, *Col;
-      switch (II->getIntrinsicID()) {
-      default:
-        report_fatal_error("Non-AMX intrinsic use tile type.");
-        break;
-      case Intrinsic::x86_tdpbssd_internal: {
-        unsigned OpNo = U.getOperandNo();
-        switch (OpNo) {
-        case 3:
-          Row = II->getArgOperand(0);
-          Col = II->getArgOperand(1);
-          break;
-        case 4:
-          Row = II->getArgOperand(0);
-          Col = II->getArgOperand(2);
-          break;
-        case 5:
-          Row = II->getArgOperand(2);
-          Col = II->getArgOperand(1);
-          break;
-        }
-        break;
-      }
-      case Intrinsic::x86_tilestored64_internal: {
-        Row = II->getArgOperand(0);
-        Col = II->getArgOperand(1);
-        break;
-      }
-      }
-      assert(Count == 0 && "Can NOT mix amx intrinsic and LLVM instruction");
-      // FIXME: The shape def should be ahead of load.
-      IRBuilder<> Builder(Inst);
-      LLVMContext &Ctx = Builder.getContext();
-      // Use the maximun column as stride.
-      Value *Stride = Builder.getInt64(64);
-      Value *I8Ptr =
-          Builder.CreateBitCast(Inst->getOperand(0), Type::getInt8PtrTy(Ctx));
-      std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
-
-      NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
-                                        None, Args);
-
-      Inst->replaceAllUsesWith(NewInst);
-    }
-    if (!NewInst)
-      splitLD(Inst);
+  AllocaInst *AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+  Value *I8Ptr =
----------------
craig.topper wrote:
> Shouldn't this be in the function's entry block?
Yes. It is in the function's entry block. That is done at line 48, in the function CreateAllocaInst(). CreateAllocaInst() is actually copied from your code. :)

================
Comment at: llvm/lib/Target/X86/X86LowerAMXType.cpp:79
+    // -->
+    // %addr = alloca <256 x i32>, align 1024
+    // store <256 x i32> %src, <256 x i32>* %addr, align 1024
----------------
pengfei wrote:
> Why the alignment not be 64?
1024 is conservative, because a vector requires its alignment to be the vector size. Here we generate a <256 x i32> vector load/store.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D91927/new/

https://reviews.llvm.org/D91927