[PATCH] D137923: [X86][AMX] Fix the shape dependency issue.

Tue Nov 15 00:54:00 PST 2022

xiangzhangllvm added a comment.

How about merging the "load + cast" at the cast position rather than at the load position?
For example, generate the tileload for lines 95 and 105 at line 105:

   89 *** IR Dump After Lower AMX intrinsics (lower-amx-intrinsics) ***
   90 define void @test_tile_dpbssd(ptr byval(%struct.__tile1024i_str) align 64 %a, ptr byval(%struct.__tile1024i_str) align 64 %b, ptr byval(%struct.__tile1024i_str) align 64 %c) {
   91 entry:
   92   %b.row.ptr = getelementptr inbounds i8, ptr %b, i64 2
   93   %b.row = load i16, ptr %b.row.ptr, align 2
   94   %b.tile.ptr = getelementptr inbounds i8, ptr %b, i64 64
   95   %b.tile = load <256 x i32>, ptr %b.tile.ptr, align 64
   96   %a.row = load i16, ptr %a, align 64
   97   %a.col.ptr = getelementptr inbounds i8, ptr %a, i64 2
   98   %a.col = load i16, ptr %a.col.ptr, align 2
   99   %a.tile.ptr = getelementptr inbounds i8, ptr %a, i64 64
  100   %a.tile = load <256 x i32>, ptr %a.tile.ptr, align 64
  101   %c.tile.ptr = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i64 0, i32 3
  102   %c.tile = load <256 x i32>, ptr %c.tile.ptr, align 64
  103   %c.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %c.tile)
  104   %a.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %a.tile)
  105   %b.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %b.tile)
  106   %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %a.row, i16 %b.row, i16 %a.col, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
  107   ret void
  108 }

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D137923/new/

https://reviews.llvm.org/D137923