[clang] [compiler-rt] [lldb] [llvm] [mlir] [SPIRV] Add reads from image buffer for shaders. (PR #115178)

Steven Perron via cfe-commits cfe-commits at lists.llvm.org
Thu Nov 7 08:13:00 PST 2024


https://github.com/s-perron updated https://github.com/llvm/llvm-project/pull/115178

>From f6295706b7353bea41651e8cb73e57c39204eb5f Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron at google.com>
Date: Fri, 20 Sep 2024 11:02:12 -0400
Subject: [PATCH 01/20] [SPIRV] Add reads from image buffer for shaders.

This commit adds an intrinsic that reads from an image buffer. We
chose to match the name of the DXIL intrinsic for simplicity in clang.

We cannot reuse the existing OpenCL read_image functions because their
names are not reserved in HLSL.

I considered refactoring generateReadImageInst so that the two
implementations could share code. However, most of the code in
generateReadImageInst is concerned with figuring out which type of
image read is being done. Once that code is factored out, the only
common part left is a single call to the MIRBuilder.
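
For illustration, a minimal sketch of the intended usage, mirroring the
tests added in this patch (the descriptor set and binding values are
arbitrary):

  %buffer = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
      @llvm.spv.handle.fromBinding.tspirv.Image_i32_5_2_0_0_2_24(
          i32 16, i32 7, i32 1, i32 0, i1 false)
  %data = call i32 @llvm.spv.typedBufferLoad(
      target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer, i32 0)

A scalar load like this selects to an OpImageRead returning a 4-element
vector followed by an OpCompositeExtract of element 0.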
---
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       |   6 +
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp |  20 ++++
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h   |   9 ++
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 108 ++++++++++++++++++
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp |  20 ++++
 .../SPIRV/hlsl-resources/BufferLoad.ll        |  66 +++++++++++
 ...lslBufferLoad.ll => ScalarResourceType.ll} |   0
 .../SPIRV/hlsl-resources/UnknownBufferLoad.ll |  30 +++++
 8 files changed, 259 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
 rename llvm/test/CodeGen/SPIRV/hlsl-resources/{HlslBufferLoad.ll => ScalarResourceType.ll} (100%)
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 629d6759dd65fd..5ce168982ea5d6 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -101,4 +101,10 @@ let TargetPrefix = "spv" in {
             [IntrNoMem]>;
   def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
   def int_spv_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
+
+  // Read a value from the image buffer. It does not translate directly to a
+  // single OpImageRead because the result type is not necessarily a
+  // 4-element vector.
+  def int_spv_typedBufferLoad
+      : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index f66506beaa6ed6..694497f9e68675 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -1056,6 +1056,11 @@ SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg,
   return nullptr;
 }
 
+SPIRVType *SPIRVGlobalRegistry::getResultType(Register VReg) {
+  MachineInstr *Instr = CurMF->getRegInfo().getVRegDef(VReg);
+  return getSPIRVTypeForVReg(Instr->getOperand(1).getReg());
+}
+
 SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
     SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
@@ -1126,6 +1131,21 @@ SPIRVGlobalRegistry::getScalarOrVectorComponentCount(SPIRVType *Type) const {
              : 1;
 }
 
+SPIRVType *
+SPIRVGlobalRegistry::getScalarOrVectorComponentType(Register VReg) const {
+  return getScalarOrVectorComponentType(getSPIRVTypeForVReg(VReg));
+}
+
+SPIRVType *
+SPIRVGlobalRegistry::getScalarOrVectorComponentType(SPIRVType *Type) const {
+  if (!Type)
+    return nullptr;
+  Register ScalarReg = Type->getOpcode() == SPIRV::OpTypeVector
+                           ? Type->getOperand(1).getReg()
+                           : Type->getOperand(0).getReg();
+  return getSPIRVTypeForVReg(ScalarReg);
+}
+
 unsigned
 SPIRVGlobalRegistry::getScalarOrVectorBitWidth(const SPIRVType *Type) const {
   assert(Type && "Invalid Type pointer");
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index a95b488960c4c3..0cf174d0e45afa 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -353,6 +353,9 @@ class SPIRVGlobalRegistry {
   SPIRVType *getSPIRVTypeForVReg(Register VReg,
                                  const MachineFunction *MF = nullptr) const;
 
+  // Return the result type of the instruction defining the register.
+  SPIRVType *getResultType(Register VReg);
+
   // Whether the given VReg has a SPIR-V type mapped to it yet.
   bool hasSPIRVTypeForVReg(Register VReg) const {
     return getSPIRVTypeForVReg(VReg) != nullptr;
@@ -388,6 +391,12 @@ class SPIRVGlobalRegistry {
   unsigned getScalarOrVectorComponentCount(Register VReg) const;
   unsigned getScalarOrVectorComponentCount(SPIRVType *Type) const;
 
+  // Return the component type in a vector if the argument is associated with
+  // a vector type. Returns the argument itself for a scalar type, and nullptr
+  // for a missing type.
+  SPIRVType *getScalarOrVectorComponentType(Register VReg) const;
+  SPIRVType *getScalarOrVectorComponentType(SPIRVType *Type) const;
+
   // For vectors or scalars of booleans, integers and floats, return the scalar
   // type's bitwidth. Otherwise calls llvm_unreachable().
   unsigned getScalarOrVectorBitWidth(const SPIRVType *Type) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index be38b22f70c583..634a8829798dd9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -264,6 +264,9 @@ class SPIRVInstructionSelector : public InstructionSelector {
   void selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType,
                                MachineInstr &I) const;
 
+  void selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType,
+                                MachineInstr &I) const;
+
   // Utilities
   Register buildI32Constant(uint32_t Val, MachineInstr &I,
                             const SPIRVType *ResType = nullptr) const;
@@ -288,6 +291,12 @@ class SPIRVInstructionSelector : public InstructionSelector {
                                   uint32_t Binding, uint32_t ArraySize,
                                   Register IndexReg, bool IsNonUniform,
                                   MachineIRBuilder MIRBuilder) const;
+  SPIRVType *getCorrespondingVec4Type(const SPIRVType *Type,
+                                      MachineInstr &I) const;
+  void extractScalarOrVectorFromVector(Register &ResultReg,
+                                       const SPIRVType *ResType,
+                                       Register &InputReg,
+                                       MachineInstr &InsertionPoint) const;
 };
 
 } // end anonymous namespace
@@ -2762,6 +2771,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     selectHandleFromBinding(ResVReg, ResType, I);
     return true;
   }
+  case Intrinsic::spv_typedBufferLoad: {
+    selectReadImageIntrinsic(ResVReg, ResType, I);
+    return true;
+  }
   default: {
     std::string DiagMsg;
     raw_string_ostream OS(DiagMsg);
@@ -2798,6 +2811,83 @@ void SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg,
       .addUse(VarReg);
 }
 
+void SPIRVInstructionSelector::selectReadImageIntrinsic(
+    Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const {
+
+  // If the load of the image is in a different basic block, then
+  // this will generate invalid code. A proper solution is to move
+  // the OpLoad from selectHandleFromBinding here. However, doing so
+  // requires changing the return type of the intrinsic. We will do
+  // that when we can, but for now we are moving forward with other
+  // issues.
+  Register ImageReg = I.getOperand(2).getReg();
+
+  SPIRVType *ReadType = getCorrespondingVec4Type(ResType, I);
+  Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType));
+  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead))
+      .addDef(ReadReg)
+      .addUse(GR.getSPIRVTypeID(ReadType))
+      .addUse(ImageReg)
+      .addUse(I.getOperand(3).getReg());
+
+  extractScalarOrVectorFromVector(ResVReg, ResType, ReadReg, I);
+}
+
+void SPIRVInstructionSelector::extractScalarOrVectorFromVector(
+    Register &ResultReg, const SPIRVType *ResType, Register &InputReg,
+    MachineInstr &InsertionPoint) const {
+  SPIRVType *InputType = GR.getResultType(InputReg);
+  assert(InputType->getOpcode() == SPIRV::OpTypeVector);
+
+  if (ResType->getOpcode() != SPIRV::OpTypeVector) {
+    assert(ResType == GR.getScalarOrVectorComponentType(InputType));
+    BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+            InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
+        .addDef(ResultReg)
+        .addUse(GR.getSPIRVTypeID(ResType))
+        .addUse(InputReg)
+        .addImm(0);
+    return;
+  }
+
+  uint64_t InputSize = GR.getScalarOrVectorComponentCount(InputType);
+  uint64_t VectorSize = GR.getScalarOrVectorComponentCount(ResType);
+  if (VectorSize == InputSize) {
+    BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+            InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCopyObject))
+        .addDef(ResultReg)
+        .addUse(GR.getSPIRVTypeID(ResType))
+        .addUse(InputReg);
+    return;
+  }
+
+  assert(VectorSize < InputSize &&
+         "Cannot extract more element than there are in the input.");
+  SmallVector<Register> ComponentRegisters;
+  SPIRVType *ScalarType = GR.getScalarOrVectorComponentType(ResType);
+  const TargetRegisterClass *ScalarRegClass = GR.getRegClass(ScalarType);
+  for (uint64_t i = 0; i < VectorSize; i++) {
+    Register ComponentReg = MRI->createVirtualRegister(ScalarRegClass);
+    BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+            InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
+        .addDef(ComponentReg)
+        .addUse(ScalarType->getOperand(0).getReg())
+        .addUse(InputReg)
+        .addImm(i);
+    ComponentRegisters.emplace_back(ComponentReg);
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+                                    InsertionPoint.getDebugLoc(),
+                                    TII.get(SPIRV::OpCompositeConstruct))
+                                .addDef(ResultReg)
+                                .addUse(GR.getSPIRVTypeID(ResType));
+
+  for (Register ComponentReg : ComponentRegisters) {
+    MIB.addUse(ComponentReg);
+  }
+}
+
 Register SPIRVInstructionSelector::buildPointerToResource(
     const SPIRVType *ResType, uint32_t Set, uint32_t Binding,
     uint32_t ArraySize, Register IndexReg, bool IsNonUniform,
@@ -3300,6 +3390,24 @@ bool SPIRVInstructionSelector::selectSpvThreadId(Register ResVReg,
   return MIB.constrainAllUses(TII, TRI, RBI);
 }
 
+SPIRVType *
+SPIRVInstructionSelector::getCorrespondingVec4Type(const SPIRVType *Type,
+                                                   MachineInstr &I) const {
+  MachineIRBuilder MIRBuilder(I);
+  if (Type->getOpcode() != SPIRV::OpTypeVector) {
+    return GR.getOrCreateSPIRVVectorType(Type, 4, MIRBuilder);
+  }
+
+  uint64_t VectorSize = Type->getOperand(2).getImm();
+  if (VectorSize == 4) {
+    return Type;
+  }
+
+  Register ScalarTypeReg = Type->getOperand(1).getReg();
+  const SPIRVType *ScalarType = GR.getSPIRVTypeForVReg(ScalarTypeReg);
+  return GR.getOrCreateSPIRVVectorType(ScalarType, 4, MIRBuilder);
+}
+
 namespace llvm {
 InstructionSelector *
 createSPIRVInstructionSelector(const SPIRVTargetMachine &TM,
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index e8641b3a105dec..673fde96ea19fd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -719,6 +719,11 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan(
          Capability::UniformTexelBufferArrayNonUniformIndexingEXT,
          Capability::StorageTexelBufferArrayNonUniformIndexingEXT});
   }
+
+  // StorageImageReadWithoutFormat became core in Vulkan 1.3.
+  if (ST.isAtLeastSPIRVVer(VersionTuple(1, 6))) {
+    addAvailableCaps({Capability::StorageImageReadWithoutFormat});
+  }
 }
 
 } // namespace SPIRV
@@ -1005,6 +1010,13 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
   }
 }
 
+static bool imageTypeHasUnknownFormat(SPIRVType *TypeInst) {
+  if (TypeInst->getOpcode() != SPIRV::OpTypeImage)
+    return false;
+  assert(TypeInst->getOperand(7).isImm() && "The image format must be an imm.");
+  return TypeInst->getOperand(7).getImm() == 0;
+}
+
 static void AddDotProductRequirements(const MachineInstr &MI,
                                       SPIRV::RequirementHandler &Reqs,
                                       const SPIRVSubtarget &ST) {
@@ -1411,6 +1423,14 @@ void addInstrRequirements(const MachineInstr &MI,
   case SPIRV::OpUDot:
     AddDotProductRequirements(MI, Reqs, ST);
     break;
+  case SPIRV::OpImageRead: {
+    Register ImageReg = MI.getOperand(2).getReg();
+    SPIRVType *TypeDef = ST.getSPIRVGlobalRegistry()->getResultType(ImageReg);
+    if (imageTypeHasUnknownFormat(TypeDef))
+      Reqs.addCapability(SPIRV::Capability::StorageImageReadWithoutFormat);
+    break;
+  }
+
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
new file mode 100644
index 00000000000000..27c66eeb238a10
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
@@ -0,0 +1,66 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-NOT: OpCapability StorageImageReadWithoutFormat
+
+; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
+; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
+
+; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: [[zero:%[0-9]+]] = OpConstant [[int]] 0
+; CHECK-DAG: [[v4_int:%[0-9]+]] = OpTypeVector [[int]] 4
+; CHECK-DAG: [[v2_int:%[0-9]+]] = OpTypeVector [[int]] 2
+; CHECK-DAG: [[RWBufferTypeInt:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 R32i {{$}}
+; CHECK-DAG: [[IntBufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[RWBufferTypeInt]]
+; CHECK-DAG: [[IntBufferVar]] = OpVariable [[IntBufferPtrType]] UniformConstant
+
+; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}}
+; CHECK-NEXT: OpLabel
+define void @RWBufferLoad_Vec4_I32() #0 {
+; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
+  %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
+      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+          i32 16, i32 7, i32 1, i32 0, i1 false)
+
+; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
+  %data0 = call <4 x i32> @llvm.spv.typedBufferLoad(
+      target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0)
+
+  ret void
+}
+
+; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}}
+; CHECK-NEXT: OpLabel
+define void @RWBufferLoad_I32() #0 {
+; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
+  %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
+      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+          i32 16, i32 7, i32 1, i32 0, i1 false)
+
+; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
+; CHECK: OpCompositeExtract [[int]] [[V]] 0
+  %data1 = call i32 @llvm.spv.typedBufferLoad(
+      target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0)
+
+  ret void
+}
+
+; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}}
+; CHECK-NEXT: OpLabel
+define void @RWBufferLoad_Vec2_I32() #0 {
+; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
+  %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
+      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+          i32 16, i32 7, i32 1, i32 0, i1 false)
+
+; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
+; CHECK: [[e0:%[0-9]+]] = OpCompositeExtract [[int]] [[V]] 0
+; CHECK: [[e1:%[0-9]+]] = OpCompositeExtract [[int]] [[V]] 1
+; CHECK: OpCompositeConstruct [[v2_int]] [[e0]] [[e1]]
+  %data0 = call <2 x i32> @llvm.spv.typedBufferLoad(
+      target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0)
+
+  ret void
+}
+
+attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/HlslBufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
similarity index 100%
rename from llvm/test/CodeGen/SPIRV/hlsl-resources/HlslBufferLoad.ll
rename to llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
new file mode 100644
index 00000000000000..7f9c6f7da2859e
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpCapability StorageImageReadWithoutFormat
+; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
+; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
+
+; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: [[zero:%[0-9]+]] = OpConstant [[int]] 0
+; CHECK-DAG: [[v4_int:%[0-9]+]] = OpTypeVector [[int]] 4
+; CHECK-DAG: [[RWBufferTypeInt:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 Unknown {{$}}
+; CHECK-DAG: [[IntBufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[RWBufferTypeInt]]
+; CHECK-DAG: [[IntBufferVar]] = OpVariable [[IntBufferPtrType]] UniformConstant
+
+; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}}
+; CHECK-NEXT: OpLabel
+define void @RWBufferLoad_Vec4_I32() #0 {
+; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
+  %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0)
+      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_0(
+          i32 16, i32 7, i32 1, i32 0, i1 false)
+
+; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
+  %data0 = call <4 x i32> @llvm.spv.typedBufferLoad(
+      target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %buffer0, i32 0)
+
+  ret void
+}
+
+attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }

>From 9c2483447f2e6e76d2ddf36f427ab1572e2a96b5 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron at google.com>
Date: Wed, 6 Nov 2024 15:57:35 -0500
Subject: [PATCH 02/20] Update SpirvUsage.rst.

---
 llvm/docs/SPIRVUsage.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 0fcaa366c8a3e0..277b9c15292c53 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -392,6 +392,14 @@ SPIR-V backend, along with their descriptions and argument details.
        If `arraySize > 1`, then the binding represents an array of resources\
        of the given size, and the handle for the resource at the given index is returned.\
        If the index is possibly non-uniform, then `isUniformIndex` must get set to true.
+   * - `int_spv_typedBufferLoad`
+     - Scalar or vector
+     - `[spirv.Image ImageBuffer, 32-bit Integer coordinate]`
+     - Loads a value from a Vulkan image buffer at the given coordinate. The \
+       image buffer data is assumed to be stored as a 4-element vector. If the \
+       return type is a scalar, then the first element of the vector is \
+       returned. If the return type is an n-element vector, then the first \
+       n elements of the 4-element vector are returned.
 
 .. _spirv-builtin-functions:
 

>From 178e1cdb86d9e84e51efb50b1ce73a86fab16798 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron at google.com>
Date: Wed, 6 Nov 2024 15:59:09 -0500
Subject: [PATCH 03/20] Small test fix.

---
 llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
index 27c66eeb238a10..c2749d13c214d2 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
@@ -19,7 +19,7 @@
 define void @RWBufferLoad_Vec4_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
-      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+      @llvm.spv.handle.fromBinding.tspirv.Image_i32_5_2_0_0_2_24(
           i32 16, i32 7, i32 1, i32 0, i1 false)
 
 ; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
@@ -34,7 +34,7 @@ define void @RWBufferLoad_Vec4_I32() #0 {
 define void @RWBufferLoad_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
-      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+      @llvm.spv.handle.fromBinding.tspirv.Image_i32_5_2_0_0_2_24(
           i32 16, i32 7, i32 1, i32 0, i1 false)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
@@ -50,7 +50,7 @@ define void @RWBufferLoad_I32() #0 {
 define void @RWBufferLoad_Vec2_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
-      @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24(
+      @llvm.spv.handle.fromBinding.tspirv.Image_i32_5_2_0_0_2_24(
           i32 16, i32 7, i32 1, i32 0, i1 false)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]

>From 52296c2f55447c83b4ce0893d41a12c4898edd7d Mon Sep 17 00:00:00 2001
From: Md Asghar Ahmad Shahid <md.asghar.ahmad.shahid at intel.com>
Date: Thu, 7 Nov 2024 20:21:02 +0530
Subject: [PATCH 04/20] [MLIR][Linalg] Re-land linalg.matmul move to ODS. +
 Remove/update failing obsolete OpDSL tests. (#115319)

The earlier PR (https://github.com/llvm/llvm-project/pull/104783), which
introduced transpose and broadcast semantics to linalg.matmul, was
reverted due to two failing OpDSL tests for linalg.matmul.

Since linalg.matmul is now defined using TableGen ODS instead of
Python-based OpDSL, these tests started failing and needed to be
removed/updated.

This commit removes/updates the failing obsolete tests in the files
below. All other files were part of the earlier PR and are just
cherry-picked.
    "mlir/test/python/integration/dialects/linalg/opsrun.py"
    "mlir/test/python/integration/dialects/transform.py"

---------

Co-authored-by: Renato Golin <rengolin at systemcall.eu>
---
 .../Dialect/Linalg/IR/LinalgInterfaces.td     |  10 +
 .../Linalg/IR/LinalgNamedStructuredOps.yaml   |  72 -----
 .../Dialect/Linalg/IR/LinalgStructuredOps.td  | 134 +++++++++
 .../Dialect/Linalg/IR/LinalgInterfaces.cpp    |  17 +-
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 263 +++++++++++++++++-
 .../Linalg/Transforms/TransposeMatmul.cpp     |   7 +
 .../Linalg/Transforms/Vectorization.cpp       |   5 +
 .../NVGPU/TransformOps/NVGPUTransformOps.cpp  |   6 +
 .../linalg/opdsl/ops/core_named_ops.py        |  17 --
 .../Dialect/Linalg/generalize-named-ops.mlir  | 111 ++++++++
 mlir/test/Dialect/Linalg/invalid.mlir         | 159 +++++++++++
 mlir/test/Dialect/Linalg/named-ops.mlir       | 243 ++++++++++++++++
 mlir/test/python/dialects/linalg/ops.py       |  75 -----
 .../integration/dialects/linalg/opsrun.py     | 115 --------
 .../python/integration/dialects/transform.py  |  28 +-
 .../mlir-linalg-ods-yaml-gen.cpp              |   6 +-
 16 files changed, 959 insertions(+), 309 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
index b81a4c9c8760cf..c0eff99c850752 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
@@ -708,6 +708,16 @@ def LinalgStructuredInterface
         return;
       }]
     >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Return true if the user has supplied explicit indexing maps for this op.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"hasUserDefinedMaps",
+      /*args=*/(ins),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{ return false; }]
+    >,
     //===------------------------------------------------------------------===//
     // Linalg generalization hooks.
     //===------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index bf2f26de26e9ed..ee88ca516de6ff 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -1065,78 +1065,6 @@ structured_op: !LinalgStructuredOpConfig
         - !ScalarExpression
           scalar_arg: rhs
 --- !LinalgOpConfig
-metadata: !LinalgOpMetadata
-  name: matmul
-  cpp_class_name: MatmulOp
-  doc: |-
-    Performs a matrix multiplication of two 2D inputs.
-
-    Numeric casting is performed on the operands to the inner multiply, promoting
-    them to the same data type as the accumulator/output.
-  implements:
-  - LinalgContractionOpInterface
-structured_op: !LinalgStructuredOpConfig
-  args:
-  - !LinalgOperandDefConfig
-    name: A
-    kind: input_tensor
-    type_var: T1
-    shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)>
-  - !LinalgOperandDefConfig
-    name: B
-    kind: input_tensor
-    type_var: T2
-    shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)>
-  - !LinalgOperandDefConfig
-    name: C
-    kind: output_tensor
-    type_var: U
-    shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)>
-  - !LinalgOperandDefConfig
-    name: cast
-    kind: type_fn_attr
-    default_fn: cast_signed
-  indexing_maps: !LinalgIndexingMapsConfig
-    static_indexing_maps:
-    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)>
-    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)>
-    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)>
-  iterator_types:
-  - parallel
-  - parallel
-  - reduction
-  assignments:
-  - !ScalarAssign
-    arg: C
-    value: !ScalarExpression
-      scalar_fn:
-        kind: binary
-        fn_name: add
-        operands:
-        - !ScalarExpression
-          scalar_arg: C
-        - !ScalarExpression
-          scalar_fn:
-            kind: binary
-            fn_name: mul
-            operands:
-            - !ScalarExpression
-              scalar_fn:
-                kind: type
-                attr_name: cast
-                type_var: U
-                operands:
-                - !ScalarExpression
-                  scalar_arg: A
-            - !ScalarExpression
-              scalar_fn:
-                kind: type
-                attr_name: cast
-                type_var: U
-                operands:
-                - !ScalarExpression
-                  scalar_arg: B
---- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: quantized_matmul
   cpp_class_name: QuantizedMatmulOp
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index c2fee8ea55c960..2b47414ff5e924 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -554,6 +554,140 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [
   let hasCanonicalizer = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Op definition for MatmulOp
+//===----------------------------------------------------------------------===//
+
+def MatmulOp : LinalgStructuredBase_Op<"matmul", [
+               AttrSizedOperandSegments,
+               LinalgContractionOpInterface]> {
+    
+  let summary = [{
+    Performs a matrix multiplication of two 2D inputs without broadcast or transpose.
+    }];
+  let description = [{
+    Numeric casting is performed on the operands to the inner multiply,
+    promoting them to the same data type as the accumulator/output.
+
+    Broadcast and Transpose semantics can be applied by specifying the explicit
+    attribute 'indexing_maps' as shown below. This is a list attribute, so the
+    list must include all the maps if specified.
+
+    Example Transpose:
+    ```
+    linalg.matmul indexing_maps = [
+                   affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose
+                   affine_map<(d0, d1, d2) -> (d2, d1)>,
+                   affine_map<(d0, d1, d2) -> (d0, d1)>
+                   ]
+                   ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>)
+                   outs(%arg2: memref<3x7xf32>)
+     ```
+
+    Example Broadcast:
+     ```
+    linalg.matmul indexing_maps = [
+                   affine_map<(d0, d1, d2) -> (d2)>,     // broadcast
+                   affine_map<(d0, d1, d2) -> (d2, d1)>,
+                   affine_map<(d0, d1, d2) -> (d0, d1)>
+                  ]
+                  ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>)
+                  outs(%arg2: memref<3x7xf32>)
+     ```
+
+     Example Broadcast and Transpose:
+     ```
+     linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose
+                       affine_map<(d0, d1, d2) -> (d2)>,     // broadcast
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>)
+    }];
+
+    let arguments = (ins
+      Variadic<AnyType>:$inputs,
+      Variadic<AnyShaped>:$outputs,
+      DefaultValuedOptionalAttr<AffineMapArrayAttr, "{}">:$indexing_maps,
+      DefaultValuedOptionalAttr<TypeFnAttr, "TypeFn::cast_signed">:$cast
+    );
+    let results = (outs Variadic<AnyRankedTensor>:$result_tensors);
+    let regions = (region AnyRegion:$region);
+
+    let skipDefaultBuilders = 1;
+    let builders = [
+      OpBuilder<
+      (ins "ValueRange":$inputs, "ValueRange":$outputs,
+            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        buildStructuredOp($_builder, $_state, std::nullopt, inputs, outputs,
+          attributes, MatmulOp::getRegionBuilder());
+      }]>,
+      OpBuilder<
+      (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
+            "ValueRange":$outputs,
+            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        buildStructuredOp($_builder, $_state, resultTensorTypes,
+          inputs, outputs, attributes, MatmulOp::getRegionBuilder());
+      }]>,
+      OpBuilder<
+      (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands,
+            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        $_state.addOperands(operands);
+        $_state.addAttributes(attributes);
+        $_state.addTypes(resultTensorTypes);
+        (void)$_state.addRegion();
+      }]>,
+      OpBuilder<
+      (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
+       "ValueRange":$outputs,
+       "Attribute":$cast, CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        $_state.addAttribute("cast", cast);
+        buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs,
+          attributes, MatmulOp::getRegionBuilder());
+      }]>
+
+    ];
+    let hasCustomAssemblyFormat = 1;
+    let hasFolder = 1;
+    let hasVerifier = 1;
+
+    let extraClassDeclaration = structuredOpsBaseDecls # [{
+      SmallVector<utils::IteratorType> getIteratorTypesArray();
+
+      /// Implements the block region builder.
+      static void regionBuilder(ImplicitLocOpBuilder &b,
+                                Block &block, ArrayRef<NamedAttribute> attrs);
+
+      /// Returns a list of AffineMaps with the typical matmul indexing characteristics.
+      SmallVector<AffineMap> getDefaultIndexingMaps();
+
+      /// Returns true if the given broadcast map \p bcastMap is valid for this op.
+      bool isValidLhsRhsBroadcastMap(AffineMap bcastMap);
+
+      static std::function<void(ImplicitLocOpBuilder &,
+                                Block &, ArrayRef<NamedAttribute>)>
+      getRegionBuilder() {
+        return regionBuilder;
+      }
+
+      ::mlir::MutableOperandRange getDpsInitsMutable() {
+        return getOutputsMutable();
+      }
+
+      // Generic methods.
+      static unsigned getNumRegionArgs();
+      std::string getLibraryCallName();
+      bool hasDynamicIndexingMaps();
+      /// Check if the op has broadcast and/or transpose semantics. Returns true
+      /// if the user-defined indexing maps are not equal to the default maps.
+      bool hasUserDefinedMaps();
+    }];
+}
+
 //===----------------------------------------------------------------------===//
 // Named Linalg ops, implemented as a declarative configurations of generic ops.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
index bd77965194b27f..0cffadf8fb64a0 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
@@ -15,14 +15,21 @@
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <numeric>
+#include <optional>
 
 using namespace mlir;
 using namespace mlir::linalg;
@@ -1211,7 +1218,6 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) {
 
 LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
   LinalgOp linalgOp = cast<LinalgOp>(op);
-
   // Mixed tensor/buffer operands are not allowed.
   if (!linalgOp.hasPureTensorSemantics() &&
       !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0)
@@ -1231,6 +1237,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
            << ") to be equal to the number of input/output operands ("
            << linalgOp->getNumOperands() << ")";
 
+  // Set this flag if this op has user defined maps. This is required to guard
+  // the error conditions below, which assume default indexing maps.
   for (OpOperand &opOperand : linalgOp->getOpOperands()) {
     AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand);
 
@@ -1247,13 +1255,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
              << " dim(s) to match the number of loops";
 
     int64_t rank = linalgOp.getRank(&opOperand);
+
     if (indexingMap.getNumResults() != rank)
       return op->emitOpError("expected operand rank (")
              << rank << ") to match the result rank of indexing_map #"
              << opOperand.getOperandNumber() << " ("
              << indexingMap.getNumResults() << ")";
   }
-
   SmallVector<unsigned> redDims;
   linalgOp.getReductionDims(redDims);
 
@@ -1263,9 +1271,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
   // Check if given shapes match to inferred shapes.
   SmallVector<int64_t, 4> endLoopRangeValues = linalgOp.getStaticLoopRanges();
   SmallVector<int64_t, 4> startLoopRangeValues(endLoopRangeValues.size(), 0);
-
-  // Verify only static cases since we can't get exact dimension sizes and loop
-  // ranges for dynamic cases in this stage.
+  // Verify only static cases since we can't get exact dimension sizes and
+  // loop ranges for dynamic cases in this stage.
   if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) {
     for (int64_t &range : endLoopRangeValues)
       range -= 1;
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 730c478c2883ef..c909d13e4314b4 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -27,6 +27,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/Matchers.h"
@@ -37,12 +38,17 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
 #include <optional>
 
 using namespace mlir;
@@ -149,15 +155,36 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region &region,
   // iterator_types is an auto-generated method.
 }
 
+/// Helper to create the typical indexing maps for MatmulOp. Returns a list
+/// of AffineMaps.
+static SmallVector<AffineMap, 3>
+getDefaultIndexingMapsForMatmul(MLIRContext *context) {
+  AffineExpr d0, d1, d2;
+  SmallVector<AffineMap, 3> indexingMaps;
+  bindDims(context, d0, d1, d2);
+  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context));
+  indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context));
+  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context));
+  return indexingMaps;
+}
+
+/// Wrapper to return the typical indexing map array attribute for MatmulOp.
+static SmallVector<Attribute> getDefaultIndexingMapAttr(MLIRContext *context) {
+  return llvm::map_to_vector(
+      getDefaultIndexingMapsForMatmul(context),
+      [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
+}
+
 /// Creates a structured operation given `inputs`, `outputs`, and `attributes`.
 /// The result types are derived automatically if `resultTensorTypes` is none.
 /// The body of the operation is filled using `regionBuilder`. All ods-gen
 /// created structured operations use the method to implement their builders.
-static void buildStructuredOp(OpBuilder &b, OperationState &state,
-                              std::optional<TypeRange> resultTensorTypes,
-                              ValueRange inputs, ValueRange outputs,
-                              ArrayRef<NamedAttribute> attributes,
-                              RegionBuilderFn regionBuilder) {
+static void buildStructuredOp(
+    OpBuilder &b, OperationState &state,
+    std::optional<TypeRange> resultTensorTypes, ValueRange inputs,
+    ValueRange outputs, ArrayRef<NamedAttribute> attributes,
+    RegionBuilderFn regionBuilder,
+    std::optional<ArrayRef<AffineMap>> indexingMaps = std::nullopt) {
   // Derive the result types if needed.
   SmallVector<Type> derivedResultTypes =
       resultTensorTypes.value_or(TypeRange());
@@ -168,6 +195,20 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state,
   state.addOperands(inputs);
   state.addOperands(outputs);
   state.addTypes(derivedResultTypes);
+
+  // Initialize indexingMaps for MatmulOp.
+  SmallVector<Attribute, 3> indexingMapsAttrVal;
+  if (indexingMaps.has_value()) {
+    for (mlir::AffineMap map : *indexingMaps) {
+      // Convert each AffineMap to an AffineMapAttr
+      indexingMapsAttrVal.push_back(AffineMapAttr::get(map));
+    }
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  } else {
+    indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext());
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  }
+
   state.addAttributes(attributes);
   state.addAttribute(
       "operandSegmentSizes",
@@ -299,11 +340,48 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser,
                                           OperationState &result,
                                           unsigned numRegionArgs,
                                           RegionBuilderFn regionBuilder) {
+
+  SmallVector<Attribute, 3> indexingMapsAttr;
+  Attribute mapAttr;
+  if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) {
+    if (parser.parseEqual())
+      return failure();
+
+    if (parser.parseLSquare())
+      return failure();
+
+    do {
+      if (parser.parseAttribute(mapAttr))
+        return failure();
+      if (!isa<AffineMapAttr>(mapAttr)) {
+        return parser.emitError(parser.getCurrentLocation(),
+                                "expected affine map attribute");
+      }
+      indexingMapsAttr.push_back(mapAttr);
+
+      if (parser.parseOptionalComma())
+        break;
+    } while (true);
+
+    if (parser.parseRSquare())
+      return failure();
+  }
+  // Initialize indexingMaps, if not supplied explicitly.
+  if (indexingMapsAttr.empty()) {
+    indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext());
+  }
+  result.addAttribute("indexing_maps",
+                      parser.getBuilder().getArrayAttr(indexingMapsAttr));
+
   // TODO: Enable when ods-gen supports captures.
   SmallVector<Type, 1> inputTypes, outputTypes;
   if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes))
     return failure();
 
+  // Parse optional attributes.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
   // TODO: consider merging results parsing into region parsing.
   // Need to wait for declarative assembly resolution to decide.
   SmallVector<Type, 1> outputTensorsTypes;
@@ -329,13 +407,9 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p,
 }
 
 static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op,
-                                   ValueRange inputs, ValueRange outputs) {
-  p.printOptionalAttrDict(
-      op->getAttrs(),
-      /*elidedAttrs=*/{"operandSegmentSizes",
-                       // See generated code in
-                       // LinalgNamedStructuredOps.yamlgen.cpp.inc
-                       "linalg.memoized_indexing_maps"});
+                                   ValueRange inputs, ValueRange outputs,
+                                   ArrayRef<StringRef> elidedAttrs = {}) {
+  p.printOptionalAttrDict(op->getAttrs(), elidedAttrs);
 
   // Printing is shared with generic ops, except for the region and
   // attributes.
@@ -3382,3 +3456,168 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder,
                                               Location loc) {
   return arith::ConstantOp::materialize(builder, value, type, loc);
 }
+
+/// Returns true if the result AffineExprs of \p explictMap are the same as
+/// those of \p defaultMap.
+static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) {
+  auto explicitRange = explictMap.getResults();
+  auto defaultRange = defaultMap.getResults();
+  DenseSet<AffineExpr> explicitSet(explicitRange.begin(), explicitRange.end());
+  DenseSet<AffineExpr> defaultSet(defaultRange.begin(), defaultRange.end());
+  llvm::set_union(explicitSet, defaultSet);
+  return explicitSet == defaultSet;
+}
+
+/// Returns true if the \p explictMap is broadcasted with respect to the
+/// \p defaultMap.
+static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) {
+  return explictMap.getNumResults() < defaultMap.getNumResults();
+}
+
+/// Verifies the broadcast and transpose semantics specified by the explicit
+/// indexing map for the MatmulOp \p matmulOp, for the operand specified by
+/// \p opIndex.
+static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp,
+                                                  unsigned opIndex) {
+  SmallVector<AffineMap, 3> opIndexingMaps = matmulOp.getIndexingMapsArray();
+  SmallVector<AffineMap, 3> defaultIndexingMaps =
+      matmulOp.getDefaultIndexingMaps();
+
+  auto opIndexingMap = opIndexingMaps[opIndex];
+  auto defaultIndexingMap = defaultIndexingMaps[opIndex];
+  // Check general validity of indexing map results.
+  if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap))
+    return matmulOp->emitOpError()
+           << "Unexpected dim expression in map result.";
+
+  // Check if the requested broadcast is valid.
+  if (isBroadcasted(opIndexingMap, defaultIndexingMap)) {
+    if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) {
+      return matmulOp->emitOpError()
+             << "Invalid broadcast requested, should be (d2).";
+    }
+    return success();
+  }
+  return success();
+}
+
+namespace mlir {
+namespace linalg {
+//===----------------------------------------------------------------------===//
+// MatMulOp
+//===----------------------------------------------------------------------===//
+SmallVector<utils::IteratorType> MatmulOp::getIteratorTypesArray() {
+  return SmallVector<utils::IteratorType>{utils::IteratorType::parallel,
+                                          utils::IteratorType::parallel,
+                                          utils::IteratorType::reduction};
+}
+
+unsigned MatmulOp::getNumRegionArgs() { return 3; }
+
+std::string MatmulOp::getLibraryCallName() {
+  return generateLibraryCallName(getOperation());
+}
+
+bool MatmulOp::hasDynamicIndexingMaps() { return true; }
+
+/// Check if the op has broadcast and/or transpose semantics. Returns true if
+/// the user-defined indexing maps are not equal to the default maps.
+bool MatmulOp::hasUserDefinedMaps() {
+  SmallVector<AffineMap, 3> defaultMaps = getDefaultIndexingMaps();
+  SmallVector<AffineMap, 3> explicitMaps = getIndexingMapsArray();
+  return defaultMaps != explicitMaps;
+}
+
+/// Implements the block region builder for the MatmulOp. This is called by
+/// 'fillStructuredOpRegion'.
+void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block,
+                             ArrayRef<NamedAttribute> attrs) {
+  assert(3 > 0 && block.getNumArguments() == 3 &&
+         "MatmulOp regionBuilder expects 3 (>=0) args");
+  RegionBuilderHelper helper(b, block);
+  SmallVector<Value> yields;
+
+  TypeFn castVal = TypeFn::cast_signed;
+  auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) {
+    return attr.getName() == "cast";
+  });
+  if (castIter != attrs.end()) {
+    if (auto attr = llvm::dyn_cast<TypeFnAttr>(castIter->getValue()))
+      castVal = attr.getValue();
+  }
+
+  Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(),
+                                    block.getArgument(0));
+  Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(),
+                                    block.getArgument(1));
+  Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2);
+  Value value4 =
+      helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3);
+  yields.push_back(value4);
+  helper.yieldOutputs(yields);
+}
+
+/// Returns a list of AffineMaps with the typical matmul indexing characteristics.
+SmallVector<AffineMap> MatmulOp::getDefaultIndexingMaps() {
+  MLIRContext *context = this->getContext();
+  return getDefaultIndexingMapsForMatmul(context);
+}
+
+/// Returns true if the given broadcast map \p bcastMap is valid for this op.
+bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) {
+  assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr.");
+  AffineExpr exp = bcastMap.getResult(0);
+  // The map is invalid if the common dimension of the matmul is not found.
+  return exp.isFunctionOfDim(bcastMap.getNumDims() - 1);
+}
+
+ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) {
+  return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(),
+                                MatmulOp::getRegionBuilder());
+}
+void MatmulOp::print(OpAsmPrinter &p) {
+  SmallVector<StringRef, 3> elidedAttrs = {
+      "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"};
+  printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(),
+                         elidedAttrs);
+
+  SmallVector<Attribute, 3> indexingMaps =
+      getDefaultIndexingMapAttr(getContext());
+  if (!llvm::equal(getIndexingMaps(), indexingMaps)) {
+    p << " indexing_maps = [";
+    llvm::interleaveComma(getIndexingMaps(), p,
+                          [&](Attribute attr) { p.printAttribute(attr); });
+    p << "]";
+  }
+}
+
+/// Verify the user defined indexing maps.
+LogicalResult MatmulOp::verify() {
+  // Verification of pure matmul is handled by verifyStructuredOpInterface().
+  if (!hasUserDefinedMaps())
+    return success();
+
+  for (unsigned opIndex = 0; opIndex < 2; opIndex++) {
+    if (failed(verifyExtendedMatmulSemantic(*this, opIndex)))
+      return failure();
+  }
+  return success();
+}
+
+LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl<OpFoldResult> &) {
+  return memref::foldMemRefCast(*this);
+}
+void MatmulOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  if (hasPureTensorSemantics())
+    return;
+  getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
+}
+
+Speculation::Speculatability MatmulOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
+} // namespace linalg
+} // namespace mlir
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp
index aa0052ce47fa7b..6b934f7e8157d4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp
@@ -31,6 +31,13 @@ using namespace mlir::linalg;
 FailureOr<Operation *> mlir::linalg::transposeMatmul(RewriterBase &rewriter,
                                                      linalg::MatmulOp matmulOp,
                                                      bool transposeLHS) {
+  // Do not let a matmul with extended semantics (explicit indexing maps)
+  // through this transform.
+  if (matmulOp.hasUserDefinedMaps()) {
+    return rewriter.notifyMatchFailure(
+        matmulOp, "only matmul ops with non-extended semantics are supported");
+  }
+
   if (!bufferization::hasTensorSemantics(matmulOp))
     return rewriter.notifyMatchFailure(
         matmulOp, "only matmul ops with tensors are supported");
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 090e0b46768d7e..757701dc024dfe 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2090,6 +2090,11 @@ vectorizeScalableVectorPrecondition(Operation *op,
       return failure();
   }
 
+  // Do not let a matmul with extended semantics (explicit indexing maps)
+  // through this transform.
+  if (linalgOp.hasUserDefinedMaps())
+    return failure();
+
   // Cond 4: Only the following ops are supported in the
   // presence of scalable vectors
   return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index 0c2275bbc4b224..3c508ed6e324b2 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -821,6 +821,12 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne(
   bool fail = true;
   // TODO: more robust detection of matmulOp, with transposes etc.
   if (isa_and_nonnull<linalg::MatmulOp>(linalgOp.getOperation())) {
+    // Do not let a matmul with extended semantics (explicit indexing maps)
+    // through this transform.
+    if (linalgOp.hasUserDefinedMaps()) {
+      return emitSilenceableError()
+             << "only matmul ops with non-extended semantics are supported";
+    }
     Location loc = linalgOp.getLoc();
     // TODO: more robust computation of laneId, for now assume a single warp.
     Value laneId = rewriter.create<gpu::ThreadIdOp>(
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index b45fecd0ee1457..5c1c984b136058 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -383,23 +383,6 @@ def select(
     O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None])
 
 
- at linalg_structured_op
-def matmul(
-    A=TensorDef(T1, S.M, S.K),
-    B=TensorDef(T2, S.K, S.N),
-    C=TensorDef(U, S.M, S.N, output=True),
-    cast=TypeFnAttrDef(default=TypeFn.cast_signed),
-):
-    """Performs a matrix multiplication of two 2D inputs.
-
-    Numeric casting is performed on the operands to the inner multiply, promoting
-    them to the same data type as the accumulator/output.
-    """
-    domain(D.m, D.n, D.k)
-    implements(ContractionOpInterface)
-    C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n])
-
-
 @linalg_structured_op
 def quantized_matmul(
     A=TensorDef(T1, S.M, S.K),
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index 1e8f1435ca0fa5..aba26c35931fd3 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -29,6 +29,34 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>,
 
 // -----
 
+func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL:   func.func @matmul_bcast_a(
+// CHECK-SAME:                              %[[VAL_0:.*]]: memref<5xf32>,
+// CHECK-SAME:                              %[[VAL_1:.*]]: memref<5x7xf32>,
+// CHECK-SAME:                              %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) {
+// CHECK:           ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32):
+// CHECK:             %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32
+// CHECK:             %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32
+// CHECK:             linalg.yield %[[VAL_7]] : f32
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
 func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>)
                     outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32>
@@ -891,3 +919,86 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vec
 
   return %0, %1: tensor<f32>, tensor<vector<2x4xf32>>
 }
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_transpose_a_explicit(
+// CHECK-SAME:                                  %[[VAL_0:.*]]: memref<5x3xf32>,
+// CHECK-SAME:                                  %[[VAL_1:.*]]: memref<5x7xf32>,
+// CHECK-SAME:                                  %[[VAL_2:.*]]: memref<3x7xf32>) {
+
+// CHECK:           linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]}
+// CHECK:           arith.mulf
+// CHECK:           arith.addf
+
+func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+                      
+  return
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL:   func.func @matmul_transpose_b_explicit(
+// CHECK-SAME:                                           %[[VAL_0:.*]]: memref<3x5xf32>,
+// CHECK-SAME:                                           %[[VAL_1:.*]]: memref<7x5xf32>,
+// CHECK-SAME:                                           %[[VAL_2:.*]]: memref<3x7xf32>) {
+
+// CHECK:           linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]}
+// CHECK:           arith.mulf
+// CHECK:           arith.addf
+
+func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+                      
+  return
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_transpose_a_b_explicit(
+// CHECK-SAME:                                             %[[VAL_0:.*]]: memref<5x3xf32>,
+// CHECK-SAME:                                             %[[VAL_1:.*]]: memref<7x5xf32>,
+// CHECK-SAME:                                             %[[VAL_2:.*]]: memref<3x7xf32>) {
+
+// CHECK:           linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]}
+// CHECK:           arith.mulf
+// CHECK:           arith.addf
+
+func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+                      
+  return
+}
+
+// -----
+
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir
index 4b5a66f8fb5b92..a59472377a732c 100644
--- a/mlir/test/Dialect/Linalg/invalid.mlir
+++ b/mlir/test/Dialect/Linalg/invalid.mlir
@@ -370,6 +370,165 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>,
 
 // -----
 
+func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) {
+  // expected-error @+1 {{expected attribute value}}
+  linalg.matmul indexing_maps = [
+                       ,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>)
+                      outs(%arg2 :memref<2x4xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) {
+  // expected-error @+1 {{Unexpected dim expression in map result}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) {
+  // expected-error @+1 {{Unexpected dim expression in map result}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> {
+  // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}}
+  %0 = linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>)
+                      outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
+  return %0: tensor<4x64xf32>
+}
+
+// -----
+
+func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> {
+  // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}}
+  %0 = linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>)
+                      outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
+  return %0: tensor<4x64xf32>
+}
+
+// -----
+
+func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) {
+  // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}}
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// -----
+
+func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) {
+  // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}}
+  linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>)
+                        indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+  return
+}
+
+// -----
+
 func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) {
   // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}}
   linalg.conv_2d_nhwc_hwcf
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 02ecbed232c8b5..65c18de8424771 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1201,6 +1201,249 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a
 
 // -----
 
+// CHECK-LABEL: func @matmul_transpose_a_explicit
+//       CHECK:   linalg.matmul
+//  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>)
+//  CHECK-SAME:     outs(%{{.+}} : memref<3x7xf32>)
+func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+                      
+  return
+}
+
+// -----
+
+func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+                      
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_transpose_b_explicit(
+// CHECK-SAME:                                           %[[VAL_0:.*]]: memref<3x5xf32>,
+// CHECK-SAME:                                           %[[VAL_1:.*]]: memref<7x5xf32>,
+// CHECK-SAME:                                           %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                      ]
+                      ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>)
+                      outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_transpose_a_b_explicit(
+// CHECK-SAME:                                             %[[VAL_0:.*]]: memref<5x3xf32>,
+// CHECK-SAME:                                             %[[VAL_1:.*]]: memref<7x5xf32>,
+// CHECK-SAME:                                             %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL: func @matmul_bcast_a
+//       CHECK:   linalg.matmul
+//  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>)
+//  CHECK-SAME:     outs(%{{.+}} : memref<3x7xf32>)
+
+// -----
+
+func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL: func @matmul_bcast_a_dim1
+//       CHECK:   linalg.matmul
+//  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>)
+//  CHECK-SAME:     outs(%{{.+}} : memref<3x7xf32>)
+
+// -----
+
+func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL: func @matmul_bcast_b
+//       CHECK:   linalg.matmul
+//  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>)
+//  CHECK-SAME:     outs(%{{.+}} : memref<3x7xf32>)
+
+// -----
+
+func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_bcast_a_b(
+// CHECK-SAME:                                %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>,
+// CHECK-SAME:                                %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d0, d2)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL: func @matmul_bcast_b_dim1
+//       CHECK:   linalg.matmul
+//  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>)
+//  CHECK-SAME:     outs(%{{.+}} : memref<3x7xf32>)
+
+// -----
+
+func.func @dynamic_matmul_bcast_a(%arg0: memref<?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d2, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<?xf32>, memref<?x?xf32>) outs(%arg2: memref<?x?xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @dynamic_matmul_bcast_a(
+// CHECK-SAME:                                      %[[VAL_0:.*]]: memref<?xf32>,
+// CHECK-SAME:                                      %[[VAL_1:.*]]: memref<?x?xf32>,
+// CHECK-SAME:                                      %[[VAL_2:.*]]: memref<?x?xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<?xf32>, memref<?x?xf32>) outs(%[[VAL_2]] : memref<?x?xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_bcast_a_transpose_b(
+// CHECK-SAME:                                  %[[VAL_0:.*]]: memref<5xf32>,
+// CHECK-SAME:                                  %[[VAL_1:.*]]: memref<7x5xf32>,
+// CHECK-SAME:                                  %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) {
+  linalg.matmul indexing_maps = [
+                       affine_map<(d0, d1, d2) -> (d2, d0)>,
+                       affine_map<(d0, d1, d2) -> (d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>
+                     ]
+                     ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>)
+  return
+}
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL:   func.func @matmul_bcast_b_transpose_a(
+// CHECK-SAME:                                          %[[VAL_0:.*]]: memref<5x3xf32>,
+// CHECK-SAME:                                          %[[VAL_1:.*]]: memref<5xf32>,
+// CHECK-SAME:                                          %[[VAL_2:.*]]: memref<3x7xf32>) {
+// CHECK:           linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
 // CHECK-LABEL: func @matmul_transpose_b
 //       CHECK:   linalg.matmul_transpose_b
 //  CHECK-SAME:     ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>)
diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py
index 3bfbcf7d7f7c81..72045a07b2da80 100644
--- a/mlir/test/python/dialects/linalg/ops.py
+++ b/mlir/test/python/dialects/linalg/ops.py
@@ -84,81 +84,6 @@ def named_form(lhs, rhs):
 
     print(module)
 
-
-# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm
-@run
-def testNamedStructuredOpGenericForm():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32)
-            )
-            def named_form(lhs, rhs):
-                init_result = tensor.empty([4, 8], f32)
-                #      CHECK: "linalg.matmul"(%{{.*}})
-                # CHECK-SAME:    cast = #linalg.type_fn<cast_signed>
-                # CHECK-SAME:    operandSegmentSizes = array<i32: 2, 1>
-                # CHECK-NEXT:  ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32):
-                # CHECK-NEXT:    arith.mulf{{.*}} (f32, f32) -> f32
-                # CHECK-NEXT:    arith.addf{{.*}} (f32, f32) -> f32
-                # CHECK-NEXT:    linalg.yield{{.*}} (f32) -> ()
-                # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32>
-                return linalg.matmul(lhs, rhs, outs=[init_result])
-
-    module.operation.print(print_generic_op_form=True)
-
-
-# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp
-@run
-def testNamedStructuredAsGenericOp():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32)
-            )
-            def generic_form(lhs, rhs):
-                init_result = tensor.EmptyOp([4, 8], f32)
-                # CHECK: linalg.generic
-                return linalg.matmul(
-                    lhs, rhs, outs=[init_result.result], emit_generic=True
-                )
-
-    print(module)
-
-
-# CHECK-LABEL: TEST: testOpResultFromOtherOp
-@run
-def testOpResultFromOtherOp():
-    with Context(), Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32)
-            )
-            def pass_an_op_directly(arg0, arg1):
-                one = arith.ConstantOp(F32Type.get(), 1.0)
-                # CHECK: %[[LHS:.*]] = linalg.fill
-                lhs = linalg.fill(one, outs=[arg0])
-                # CHECK: %[[RHS:.*]] = linalg.fill
-                rhs = linalg.fill(one, outs=[arg1])
-                # CHECK: %[[INIT:.*]] = tensor.empty
-                init = tensor.EmptyOp([4, 8], f32)
-                # CHECK: linalg.matmul
-                # CHECK: ins(%[[LHS]], %[[RHS]]
-                # CHECK: outs(%[[INIT]]
-                return linalg.matmul(lhs, rhs, outs=init)
-
-    print(module)
-
-
 # CHECK-LABEL: TEST: testIdentityRegionOps
 @run
 def testIdentityRegionOps():
diff --git a/mlir/test/python/integration/dialects/linalg/opsrun.py b/mlir/test/python/integration/dialects/linalg/opsrun.py
index f6519fb17a6b98..f77900bc277736 100644
--- a/mlir/test/python/integration/dialects/linalg/opsrun.py
+++ b/mlir/test/python/integration/dialects/linalg/opsrun.py
@@ -50,37 +50,6 @@ def log(*args):
 }
 """
 
-matmul_boiler = """
-func.func @main() -> f32 attributes {llvm.emit_c_interface} {
-  %v0 = arith.constant 0.0 : f32
-  %v1 = arith.constant -1 : i8
-  %v2 = arith.constant 2.0 : f32
-
-  %A = memref.alloc() : memref<4x16xi8>
-  %B = memref.alloc() : memref<16x8xf32>
-  %C0 = memref.alloc() : memref<4x8xf32>
-  %C1 = memref.alloc() : memref<4x8xf32>
-  linalg.fill ins(%v1 : i8) outs(%A : memref<4x16xi8>)
-  linalg.fill ins(%v2 : f32) outs(%B : memref<16x8xf32>)
-  linalg.fill ins(%v0 : f32) outs(%C0 : memref<4x8xf32>)
-  linalg.fill ins(%v0 : f32) outs(%C1 : memref<4x8xf32>)
-
-  call @matmul_signed_on_buffers(%A, %B, %C0) :
-    (memref<4x16xi8>, memref<16x8xf32>, memref<4x8xf32>) -> ()
-  call @matmul_unsigned_on_buffers(%A, %B, %C1) :
-    (memref<4x16xi8>, memref<16x8xf32>, memref<4x8xf32>) -> ()
-
-  %c0 = arith.constant 0 : index
-  %res0 = memref.load %C0[%c0, %c0] : memref<4x8xf32>
-  %res1 = memref.load %C1[%c0, %c0] : memref<4x8xf32>
-
-  %0 = arith.addf %res0, %res1 : f32
-
-  // TODO: FFI-based solution to allow testing and printing with python code.
-  return %0 : f32
-}
-"""
-
 fill_boiler = """
 func.func @main() -> i32 attributes {llvm.emit_c_interface} {
   %O0 = memref.alloc() : memref<i32>
@@ -296,90 +265,6 @@ def elemwise_log_mul_on_buffers(lhs, rhs, out):
 test_elemwise_generic()
 
 
-def test_matmul_builtin():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        i8 = IntegerType.get_signless(8)
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_signed_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out])
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_unsigned_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out], cast=TypeFn.cast_unsigned)
-
-        execution_engine = ExecutionEngine(transform(module, matmul_boiler))
-
-        # TODO: FFI-based solution to allow testing and printing with python code.
-        # Prepare arguments: one result f32.
-        # Arguments must be passed as pointers.
-        c_float_p = ctypes.c_float * 1
-        res = c_float_p(-1.0)
-        execution_engine.invoke("main", res)
-
-        log("RESULT: ", res[0])
-        # matmul_signed_on_buffers: -1 * 2.0 * 16 = -32
-        # matmul_unsigned_on_buffers: (2^8-1) * 2.0 * 16 = 8160
-        # CHECK: RESULT: 8128
-
-
-test_matmul_builtin()
-
-
-def test_matmul_generic():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        i8 = IntegerType.get_signless(8)
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_signed_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out], emit_generic=True)
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_unsigned_on_buffers(lhs, rhs, out):
-                linalg.matmul(
-                    lhs, rhs, outs=[out], cast=TypeFn.cast_unsigned, emit_generic=True
-                )
-
-        execution_engine = ExecutionEngine(transform(module, matmul_boiler))
-
-        # TODO: FFI-based solution to allow testing and printing with python code.
-        # Prepare arguments: one result f32.
-        # Arguments must be passed as pointers.
-        c_float_p = ctypes.c_float * 1
-        res = c_float_p(-1.0)
-        execution_engine.invoke("main", res)
-
-        log("RESULT: ", res[0])
-        # matmul_signed_on_buffers = -1 * 2.0 * 16 = -32
-        # matmul_unsigned_on_buffers = (2^8-1) * 2.0 * 16 = 8160
-        # CHECK: RESULT: 8128
-
-
-test_matmul_generic()
-
-
 def test_fill_builtin():
     with Context() as ctx, Location.unknown():
         module = Module.create()
diff --git a/mlir/test/python/integration/dialects/transform.py b/mlir/test/python/integration/dialects/transform.py
index bc88a61314d0d8..303274a8f88287 100644
--- a/mlir/test/python/integration/dialects/transform.py
+++ b/mlir/test/python/integration/dialects/transform.py
@@ -99,26 +99,28 @@ def basic(target: any_op_t()):
 # CHECK-LABEL: TEST: test_apply_patterns
 @construct_and_print_in_module
 def test_apply_patterns(module_):
-    M, N, K = 3, 5, 3
+    b, M, N, K = 1, 3, 5, 3
 
-    # CHECK-LABEL:   func.func @matmul(
-    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    # CHECK-LABEL:   func.func @batch_reduce_matmul(
+    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<1x3x5xf32>,
+    # CHECK-SAME:                      %[[VAL_1:.*]]: tensor<1x5x3xf32>,
+    # CHECK-SAME:                      %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
     # CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
     # CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : i32
-    # CHECK:           %[[VAL_5:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+    # CHECK:           %[[VAL_5:.*]] = linalg.batch_reduce_matmul ins(%[[VAL_0]], %[[VAL_1]] : tensor<1x3x5xf32>, tensor<1x5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
     # CHECK:           return %[[VAL_5]] : tensor<3x3xf32>
     # CHECK:         }
     @func.func(
-        T.tensor(M, N, T.f32()), T.tensor(N, K, T.f32()), T.tensor(M, K, T.f32())
+        T.tensor(b, M, N, T.f32()), T.tensor(b, N, K, T.f32()), T.tensor(M, K, T.f32())
     )
-    def matmul(A, B, C):
+    def batch_reduce_matmul(A, B, C):
         i = arith.constant(T.i32(), 1)
         v = arith.addi(i, i)
-        return linalg.matmul(A, B, outs=[C])
+        return linalg.batch_reduce_matmul(A, B, outs=[C])
 
     # CHECK-LABEL:   module attributes {transform.with_named_sequence} {
     # CHECK:           transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) {
-    # CHECK:             %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+    # CHECK:             %[[VAL_1:.*]] = transform.structured.match ops{["linalg.batch_reduce_matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
     # CHECK:             %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation
     # CHECK:             transform.apply_patterns to %[[VAL_2]] {
     # CHECK:               transform.apply_patterns.canonicalization
@@ -132,7 +134,9 @@ def matmul(A, B, C):
     def mod():
         @named_sequence("__transform_main", [any_op_t()], [])
         def basic(variant_op: any_op_t()):
-            matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"])
+            matmul = structured_match(
+                any_op_t(), variant_op, ops=["linalg.batch_reduce_matmul"]
+            )
             top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func")
 
             @apply_patterns(top_func)
@@ -147,9 +151,9 @@ def pats():
     pm = PassManager.parse("builtin.module(transform-interpreter)")
     pm.run(module_.operation)
 
-    # CHECK-LABEL:   func.func @matmul(
-    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
-    # CHECK:           %[[VAL_3:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+    # CHECK-LABEL:   func.func @batch_reduce_matmul(
+    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<1x3x5xf32>, %[[VAL_1:.*]]: tensor<1x5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    # CHECK:           %[[VAL_3:.*]] = linalg.batch_reduce_matmul ins(%[[VAL_0]], %[[VAL_1]] : tensor<1x3x5xf32>, tensor<1x5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
     # CHECK:           return %[[VAL_3]] : tensor<3x3xf32>
     # CHECK:         }
     print(module_)
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
index 5f86c0cd747077..6be7d4320c6562 100644
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -678,7 +678,11 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{
     {0}::getNumRegionArgs(), {0}::getRegionBuilder());
 }
 void {0}::print(OpAsmPrinter &p) {{
-  ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs());
+  SmallVector<StringRef, 3> elidedAttrs = {{"operandSegmentSizes",
+                                           "linalg.memoized_indexing_maps",
+                                           "indexing_maps"};
+  ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(),
+                           elidedAttrs);
 }
 )FMT";
 

>From 05b213b1fd0e63be274775714389dd389f567f36 Mon Sep 17 00:00:00 2001
From: weiwei chen <weiwei.chen at modular.com>
Date: Thu, 7 Nov 2024 10:05:19 -0500
Subject: [PATCH 05/20] [Backend] Add clearSubtargetMap API for TargetMachine.
 (#112383)

- [x] Add a `clearSubtargetInfo` API to TargetMachine and each backend so
that the memory used by each backend's `SubtargetInfo` map can be
released when needed. Keep this API `protected` so that it is used with
caution.
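
For out-of-tree targets, opting in follows the same pattern as the
in-tree backends below. A minimal sketch, assuming a hypothetical target
named `Foo` (neither `FooTargetMachine` nor `FooSubtarget` exists in
this patch):

    #include "llvm/ADT/StringMap.h"
    #include "llvm/Target/TargetMachine.h"
    #include <memory>

    struct FooSubtarget {}; // stand-in for a real subtarget class

    class FooTargetMachine : public llvm::LLVMTargetMachine {
      // Cached per-function subtargets; this map is the memory the new
      // hook releases.
      mutable llvm::StringMap<std::unique_ptr<FooSubtarget>> SubtargetMap;

    protected:
      /// Reset internal state by dropping all cached subtarget instances.
      void reset() override { SubtargetMap.clear(); }

      // Constructors and the rest of the TargetMachine interface omitted.
    };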
---
 llvm/include/llvm/Target/TargetMachine.h         | 3 +++
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 2 ++
 llvm/lib/Target/AArch64/AArch64TargetMachine.h   | 3 +++
 llvm/lib/Target/ARM/ARMTargetMachine.cpp         | 2 ++
 llvm/lib/Target/ARM/ARMTargetMachine.h           | 3 +++
 llvm/lib/Target/X86/X86TargetMachine.cpp         | 2 ++
 llvm/lib/Target/X86/X86TargetMachine.h           | 3 +++
 7 files changed, 18 insertions(+)

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index c3e9d41315f617..fa3ab58a21ddcb 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -448,6 +448,9 @@ class LLVMTargetMachine : public TargetMachine {
 
   void initAsmInfo();
 
+  /// Reset internal state.
+  virtual void reset() {};
+
 public:
   /// Get a TargetTransformInfo implementation for the target.
   ///
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index c7bd0390b65620..e24a874e74970f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -272,6 +272,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   initializeAArch64GlobalsTaggingPass(*PR);
 }
 
+void AArch64TargetMachine::reset() { SubtargetMap.clear(); }
+
 //===----------------------------------------------------------------------===//
 // AArch64 Lowering public interface.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 1a470ca87127ce..e4d0aff50d8f67 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -26,6 +26,9 @@ class AArch64TargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
 
+  /// Reset internal state.
+  void reset() override;
+
 public:
   AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index a58c63dcf762d1..74a63361d341b7 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -642,3 +642,5 @@ bool ARMBaseTargetMachine::parseMachineFunctionInfo(
   MF.getInfo<ARMFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
   return false;
 }
+
+void ARMBaseTargetMachine::reset() { SubtargetMap.clear(); }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h
index 75ee50f0e93c88..5b3594a4dcca8a 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -38,6 +38,9 @@ class ARMBaseTargetMachine : public LLVMTargetMachine {
   bool isLittle;
   mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
 
+  /// Reset internal state.
+  void reset() override;
+
 public:
   ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 4ba0ac11d20953..dcdd059b4c0cf9 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -376,6 +376,8 @@ bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
   return SrcAS < 256 && DestAS < 256;
 }
 
+void X86TargetMachine::reset() { SubtargetMap.clear(); }
+
 //===----------------------------------------------------------------------===//
 // X86 TTI query.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index ec4a93e9c9d4b0..d27c8a4d101165 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -31,6 +31,9 @@ class X86TargetMachine final : public LLVMTargetMachine {
   // True if this is used in JIT.
   bool IsJIT;
 
+  /// Reset internal state.
+  void reset() override;
+
 public:
   X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                    StringRef FS, const TargetOptions &Options,

>From e3a58e9199ca38a28afcd2d1ce27b082a8b76d02 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 7 Nov 2024 06:17:06 -0800
Subject: [PATCH 06/20] [SLP][NFC] Add a test with segmented loads, NFC

---
 .../SLPVectorizer/RISCV/segmented-stores.ll   | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
new file mode 100644
index 00000000000000..ae1c3e1ee0da20
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-stores.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v -pass-remarks-output=%t -mcpu=sifive-p670 < %s | FileCheck %s
+; RUN: FileCheck %s --check-prefix YAML --input-file=%t
+
+; YAML-LABEL: --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            StoresVectorized
+; YAML-NEXT: Function:        test
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '-1'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '2'
+define void @test(ptr %h) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[H:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DCT2X211:%.*]] = alloca [0 x [0 x [8 x i64]]], i32 0, align 16
+; CHECK-NEXT:    [[CHROMA_DC209:%.*]] = getelementptr i8, ptr [[H]], i64 0
+; CHECK-NEXT:    [[ARRAYIDX33_I:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX36_I181:%.*]] = getelementptr i8, ptr [[DCT2X211]], i64 24
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 4 [[DCT2X211]], i64 16, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    store <2 x i64> [[TMP0]], ptr [[CHROMA_DC209]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX33_I]], align 2
+; CHECK-NEXT:    [[ARRAYIDX5_I226:%.*]] = getelementptr i8, ptr [[H]], i64 16
+; CHECK-NEXT:    store i64 [[TMP2]], ptr [[ARRAYIDX5_I226]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX36_I181]], align 2
+; CHECK-NEXT:    [[ARRAYIDX7_I228:%.*]] = getelementptr i8, ptr [[H]], i64 24
+; CHECK-NEXT:    store i64 [[TMP3]], ptr [[ARRAYIDX7_I228]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dct2x211 = alloca [0 x [0 x [8 x i64]]], i32 0, align 16
+  %chroma_dc209 = getelementptr i8, ptr %h, i64 0
+  %arrayidx30.i = getelementptr i8, ptr %dct2x211, i64 16
+  %arrayidx33.i = getelementptr i8, ptr %dct2x211, i64 8
+  %arrayidx36.i181 = getelementptr i8, ptr %dct2x211, i64 24
+  %0 = load i64, ptr %dct2x211, align 16
+  store i64 %0, ptr %chroma_dc209, align 2
+  %1 = load i64, ptr %arrayidx30.i, align 4
+  %arrayidx3.i224 = getelementptr i8, ptr %h, i64 8
+  store i64 %1, ptr %arrayidx3.i224, align 2
+  %2 = load i64, ptr %arrayidx33.i, align 2
+  %arrayidx5.i226 = getelementptr i8, ptr %h, i64 16
+  store i64 %2, ptr %arrayidx5.i226, align 2
+  %3 = load i64, ptr %arrayidx36.i181, align 2
+  %arrayidx7.i228 = getelementptr i8, ptr %h, i64 24
+  store i64 %3, ptr %arrayidx7.i228, align 2
+  ret void
+}
+

>From 3d2f901f4c3a3fe4d75f389e14bce020832b847a Mon Sep 17 00:00:00 2001
From: Thomas Fransham <tfransham at gmail.com>
Date: Thu, 7 Nov 2024 15:11:40 +0000
Subject: [PATCH 07/20] [ORC] Switch to new visibility macros for JIT debug
 symbols (#113848)

Use LLVM_ALWAYS_EXPORT for __jit_debug_descriptor and
__jit_debug_register_code so they're exported even if LLVM is not built
as a shared library.

This is part of the work to enable LLVM_BUILD_LLVM_DYLIB and plugins on
Windows (#109483).
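
Roughly, the division of labor between the two macros (a sketch with a
stand-in symbol, not code from this patch):

    #include "llvm/Support/Compiler.h"

    // Definition side: exported even when LLVM is built statically.
    extern "C" LLVM_ALWAYS_EXPORT int ExampleJITFlag = 0;

    // Consumer side (a tool in another binary): plain declaration,
    // imported/exported as appropriate for the build configuration.
    extern "C" LLVM_ABI int ExampleJITFlag;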
---
 llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp  | 5 +++--
 llvm/tools/lli/lli.cpp                                       | 3 ++-
 .../llvm-jitlink-executor/llvm-jitlink-executor.cpp          | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
index 7529d9cef67ed5..6347032f010bef 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
@@ -10,6 +10,7 @@
 
 #include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormatVariadic.h"
 
 #include <cstdint>
@@ -26,13 +27,13 @@ extern "C" {
 // We put information about the JITed function in this global, which the
 // debugger reads.  Make sure to specify the version statically, because the
 // debugger checks the version before we can set it during runtime.
-LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
+LLVM_ALWAYS_EXPORT
 struct jit_descriptor __jit_debug_descriptor = {JitDescriptorVersion, 0,
                                                 nullptr, nullptr};
 
 // Debuggers that implement the GDB JIT interface put a special breakpoint in
 // this function.
-LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
+LLVM_ALWAYS_EXPORT
 LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() {
   // The noinline and the asm prevent calls to this function from being
   // optimized out.
diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index 4c023bcdd61658..540c43889da758 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -50,6 +50,7 @@
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Format.h"
@@ -751,7 +752,7 @@ int main(int argc, char **argv, char * const *envp) {
 
 // JITLink debug support plugins put information about JITed code in this GDB
 // JIT Interface global from OrcTargetProcess.
-extern "C" struct jit_descriptor __jit_debug_descriptor;
+extern "C" LLVM_ABI struct jit_descriptor __jit_debug_descriptor;
 
 static struct jit_code_entry *
 findNextDebugDescriptorEntry(struct jit_code_entry *Latest) {
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
index 73a50deb152d84..86b89a38c17601 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Error.h"
@@ -117,7 +118,7 @@ int openListener(std::string Host, std::string PortStr) {
 
 // JITLink debug support plugins put information about JITed code in this GDB
 // JIT Interface global from OrcTargetProcess.
-extern "C" struct jit_descriptor __jit_debug_descriptor;
+extern "C" LLVM_ABI struct jit_descriptor __jit_debug_descriptor;
 
 static void *findLastDebugDescriptorEntryPtr() {
   struct jit_code_entry *Last = __jit_debug_descriptor.first_entry;

>From 8b83658ce9557be5c467b63f8ac6060ebedf0ae2 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis at chromium.org>
Date: Thu, 7 Nov 2024 10:16:13 -0500
Subject: [PATCH 08/20] [gn] port 4d4024e1edf3

---
 llvm/utils/gn/secondary/lldb/test/BUILD.gn | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index cb6380882e7cd7..69950f94836580 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -150,7 +150,10 @@ write_lit_cfg("lit_unit_site_cfg") {
   # Fully-qualified instead of relative for LIT_SITE_CFG_IN_HEADER.
   input = "//lldb/test/Unit/lit.site.cfg.py.in"
   output = lldb_lit_unit_site_cfg_file
-  extra_values = [ "LLVM_BUILD_MODE=." ]
+  extra_values = [
+    "LLVM_BUILD_MODE=.",
+    "LLVM_USE_SANITIZER=",
+  ]
 }
 
 action("lit-lldb-init-quiet") {

>From 3b878238193e621ed914e4e21aa9d8a13144db90 Mon Sep 17 00:00:00 2001
From: Devajith <devajith.valaparambil.sreeramaswamy at cern.ch>
Date: Thu, 7 Nov 2024 16:18:07 +0100
Subject: [PATCH 09/20] [lineeditor] Add `setHistorySize()` method for
 adjusting history size (#110092)

This patch adds a `setHistorySize` method to `LineEditor`.

This is particularly useful for tools like `clang-repl`, `clang-query`,
`mlir-query`, and other REPL interfaces that may need to manage their
history size.
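
A minimal usage sketch (the tool name and the size of 1000 are
illustrative assumptions, not part of the patch):

    #include "llvm/LineEditor/LineEditor.h"
    #include <optional>
    #include <string>

    int main() {
      llvm::LineEditor LE("my-repl",
                          llvm::LineEditor::getDefaultHistoryPath("my-repl"));
      LE.setHistorySize(1000); // raise the cap from the default of 800
      while (std::optional<std::string> Line = LE.readLine()) {
        // ... evaluate *Line ...
      }
      LE.saveHistory();
      return 0;
    }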
---
 llvm/include/llvm/LineEditor/LineEditor.h | 1 +
 llvm/lib/LineEditor/LineEditor.cpp        | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/LineEditor/LineEditor.h b/llvm/include/llvm/LineEditor/LineEditor.h
index 29fdf9f85c6fa5..ca0312bf6297f5 100644
--- a/llvm/include/llvm/LineEditor/LineEditor.h
+++ b/llvm/include/llvm/LineEditor/LineEditor.h
@@ -41,6 +41,7 @@ class LineEditor {
 
   void saveHistory();
   void loadHistory();
+  void setHistorySize(int size);
 
   static std::string getDefaultHistoryPath(StringRef ProgName);
 
diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp
index d0d138bb1f9d8d..8f667b0238df8d 100644
--- a/llvm/lib/LineEditor/LineEditor.cpp
+++ b/llvm/lib/LineEditor/LineEditor.cpp
@@ -20,6 +20,7 @@
 #endif
 
 using namespace llvm;
+constexpr int DefaultHistorySize = 800;
 
 std::string LineEditor::getDefaultHistoryPath(StringRef ProgName) {
   SmallString<32> Path;
@@ -220,8 +221,8 @@ LineEditor::LineEditor(StringRef ProgName, StringRef HistoryPath, FILE *In,
            NULL); // Fix the delete key.
   ::el_set(Data->EL, EL_CLIENTDATA, Data.get());
 
+  setHistorySize(DefaultHistorySize);
   HistEvent HE;
-  ::history(Data->Hist, &HE, H_SETSIZE, 800);
   ::history(Data->Hist, &HE, H_SETUNIQUE, 1);
   loadHistory();
 }
@@ -248,6 +249,11 @@ void LineEditor::loadHistory() {
   }
 }
 
+void LineEditor::setHistorySize(int size) {
+  HistEvent HE;
+  ::history(Data->Hist, &HE, H_SETSIZE, size);
+}
+
 std::optional<std::string> LineEditor::readLine() const {
   // Call el_gets to prompt the user and read the user's input.
   int LineLen = 0;
@@ -291,6 +297,7 @@ LineEditor::~LineEditor() {
 
 void LineEditor::saveHistory() {}
 void LineEditor::loadHistory() {}
+void LineEditor::setHistorySize(int size) {}
 
 std::optional<std::string> LineEditor::readLine() const {
   ::fprintf(Data->Out, "%s", Prompt.c_str());

>From c868d57053560ba8850b456a0335c856f0839a8c Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis at chromium.org>
Date: Thu, 7 Nov 2024 10:25:12 -0500
Subject: [PATCH 10/20] [gn] port c6f3b7bcd0596d3 (libc++ __config_site HAS_NO
 -> HAS)

---
 .../gn/secondary/libcxx/include/BUILD.gn      | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 776f1d32c5f520..8a9aff9c05d185 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -19,23 +19,23 @@ if (current_toolchain == default_toolchain) {
       "_LIBCPP_ABI_FORCE_ITANIUM=",
       "_LIBCPP_ABI_FORCE_MICROSOFT=",
       "_LIBCPP_EXTRA_SITE_DEFINES=",
-      "_LIBCPP_HAS_NO_FILESYSTEM=",
-      "_LIBCPP_HAS_NO_THREADS=",
-      "_LIBCPP_HAS_NO_MONOTONIC_CLOCK=",
+      "_LIBCPP_HAS_FILESYSTEM=1",
+      "_LIBCPP_HAS_THREADS=1",
+      "_LIBCPP_HAS_MONOTONIC_CLOCK=1",
       "_LIBCPP_HAS_MUSL_LIBC=",
       "_LIBCPP_HAS_THREAD_API_PTHREAD=",
       "_LIBCPP_HAS_THREAD_API_EXTERNAL=",
       "_LIBCPP_HAS_THREAD_API_WIN32=",
       "_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=",
-      "_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS=1",
+      "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS=",
       "_LIBCPP_NO_VCRUNTIME=",
       "_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=",
-      "_LIBCPP_HAS_NO_RANDOM_DEVICE=",
-      "_LIBCPP_HAS_NO_LOCALIZATION=",
-      "_LIBCPP_HAS_NO_UNICODE=",
-      "_LIBCPP_HAS_NO_WIDE_CHARACTERS=",
+      "_LIBCPP_HAS_RANDOM_DEVICE=1",
+      "_LIBCPP_HAS_LOCALIZATION=1",
+      "_LIBCPP_HAS_UNICODE=1",
+      "_LIBCPP_HAS_WIDE_CHARACTERS=1",
       "_LIBCPP_HAS_NO_STD_MODULES=",
-      "_LIBCPP_HAS_NO_TERMINAL=",
+      "_LIBCPP_HAS_TERMINAL=1",
       "_LIBCPP_INSTRUMENTED_WITH_ASAN=",
       "_LIBCPP_ABI_DEFINES=",
       "_LIBCPP_HARDENING_MODE_DEFAULT=_LIBCPP_HARDENING_MODE_NONE",
@@ -53,10 +53,10 @@ if (current_toolchain == default_toolchain) {
     } else {
       values += [ "_LIBCPP_ABI_NAMESPACE=__" + libcxx_abi_version ]
     }
-    if (!libcxx_enable_time_zone_database) {
-      values += [ "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE=1" ]
+    if (libcxx_enable_time_zone_database) {
+      values += [ "_LIBCPP_HAS_TIME_ZONE_DATABASE=1" ]
     } else {
-      values += [ "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE=" ]
+      values += [ "_LIBCPP_HAS_TIME_ZONE_DATABASE=" ]
     }
   }
 

>From 7df4bd50b3c76984a692de8508fc10b777c58ca7 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Thu, 7 Nov 2024 15:26:10 +0000
Subject: [PATCH 11/20] [lldb][test] Disable new dwarf5 integral member tests
 on Windows

Added by https://github.com/llvm/llvm-project/pull/111859

Due to:
https://lab.llvm.org/buildbot/#/builders/141/builds/3691

This is not uncommon with DWARF testing on Windows. We may be
discarding the required information during linking.

I will look into it next week.
---
 .../TestConstStaticIntegralMember.py                          | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
index e3d004b38a1571..23349f7e89acae 100644
--- a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
+++ b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
@@ -142,6 +142,8 @@ def check_inline_static_members(self, flags):
             "ClassWithConstexprs::scoped_enum_val", "ScopedEnum", "scoped_enum_case2"
         )
 
+    # Fails on Windows for unknown reasons.
+    @skipIfWindows
     # On linux this passes due to the manual index
     @expectedFailureDarwin(debug_info=no_match(["dsym"]))
     def test_inline_static_members_dwarf5(self):
@@ -193,6 +195,8 @@ def check_shadowed_static_inline_members(self, flags):
         self.expect_expr("ns::Foo::mem", result_value="10")
         self.expect_expr("::Foo::mem", result_value="-29")
 
+    # Fails on Windows for unknown reasons.
+    @skipIfWindows
     # On linux this passes due to the manual index
     @expectedFailureDarwin(debug_info=no_match(["dsym"]))
     def test_shadowed_static_inline_members_dwarf5(self):

>From 6f115bb01688ea1794aee33a878d9efcee457d56 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Thu, 7 Nov 2024 10:34:00 -0500
Subject: [PATCH 12/20] [C2y] Implement WG14 N3344 (#115313)

https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3344.pdf

This paper disallows a single `void` parameter from having qualifiers or
storage class specifiers. Clang has diagnosed most of these as an error
for a long time, but `register void` was previously accepted in all C
language modes and is now rejected in all of them.
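
A minimal illustration of the rule (the new clang/test/C/C2y/n3344.c below
covers the full set of cases):

    void ok(void);             /* still valid: single unnamed 'void' parameter */
    void bad1(const void);     /* rejected: qualified 'void' parameter */
    void bad2(register void);  /* previously accepted in C; now rejected */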
---
 clang/docs/ReleaseNotes.rst |  6 ++++++
 clang/lib/Sema/SemaDecl.cpp | 18 +++++++++++++++---
 clang/test/C/C2y/n3344.c    | 28 ++++++++++++++++++++++++++++
 clang/www/c_status.html     |  2 +-
 4 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/C/C2y/n3344.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 061a1fed3f7d48..3e7d8e15110f9d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -280,6 +280,12 @@ C2y Feature Support
   this is now a C2y extension in C. ``-Wgnu-case-range`` still applies in C++
   modes.
 
+- Clang implemented support for `N3344 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3344.pdf>`_
+  which disallows a ``void`` parameter from having a qualifier or storage class
+  specifier. Note that ``register void`` was previously accepted in all C
+  language modes but is now rejected (all of the other qualifiers and storage
+  class specifiers were previously rejected).
+
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index aba6b555ff28f3..c9cd81a48fbe51 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -15002,6 +15002,12 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D,
   const DeclSpec &DS = D.getDeclSpec();
 
   // Verify C99 6.7.5.3p2: The only SCS allowed is 'register'.
+  // C2y 6.7.7.4p4: A parameter declaration shall not specify a void type,
+  // except for the special case of a single unnamed parameter of type void
+  // with no storage class specifier, no type qualifier, and no following
+  // ellipsis terminator.
+  // Clang applies the C2y rules for 'register void' in all C language modes,
+  // matching GCC, because it is questionable what such a declaration could
+  // possibly mean.
 
   // C++03 [dcl.stc]p2 also permits 'auto'.
   StorageClass SC = SC_None;
@@ -15010,10 +15016,16 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D,
     // In C++11, the 'register' storage class specifier is deprecated.
     // In C++17, it is not allowed, but we tolerate it as an extension.
     if (getLangOpts().CPlusPlus11) {
+      Diag(DS.getStorageClassSpecLoc(), getLangOpts().CPlusPlus17
+                                            ? diag::ext_register_storage_class
+                                            : diag::warn_deprecated_register)
+          << FixItHint::CreateRemoval(DS.getStorageClassSpecLoc());
+    } else if (!getLangOpts().CPlusPlus &&
+               DS.getTypeSpecType() == DeclSpec::TST_void) {
       Diag(DS.getStorageClassSpecLoc(),
-           getLangOpts().CPlusPlus17 ? diag::ext_register_storage_class
-                                     : diag::warn_deprecated_register)
-        << FixItHint::CreateRemoval(DS.getStorageClassSpecLoc());
+           diag::err_invalid_storage_class_in_func_decl)
+          << FixItHint::CreateRemoval(DS.getStorageClassSpecLoc());
+      D.getMutableDeclSpec().ClearStorageClassSpecs();
     }
   } else if (getLangOpts().CPlusPlus &&
              DS.getStorageClassSpec() == DeclSpec::SCS_auto) {
diff --git a/clang/test/C/C2y/n3344.c b/clang/test/C/C2y/n3344.c
new file mode 100644
index 00000000000000..bd3d440cb5d12a
--- /dev/null
+++ b/clang/test/C/C2y/n3344.c
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
+// RUN: %clang_cc1 -verify -Wall -pedantic %s
+
+/* WG14 N3344: Yes
+ * Slay Some Earthly Demons VI
+ *
+ * A 'void' parameter cannot have any qualifiers or storage class specifiers,
+ * nor can it be followed by an ellipsis.
+ *
+ * Note: Clang treats 'register void' as a DR and rejects it in all
+ * language modes; there's no evidence that this will break users and it's not
+ * clear what the programmer intended if they wrote such code anyway. This
+ * matches GCC's behavior.
+ */
+
+void baz(volatile void);         // expected-error {{'void' as parameter must not have type qualifiers}}
+void bar(const void);            // expected-error {{'void' as parameter must not have type qualifiers}}
+void foo(register void);         // expected-error {{invalid storage class specifier in function declarator}}
+void quux(static void);          // expected-error {{invalid storage class specifier in function declarator}}
+void quobble(auto void);         // expected-error {{invalid storage class specifier in function declarator}}
+void quubble(extern void);       // expected-error {{invalid storage class specifier in function declarator}}
+// FIXME: it's odd that these aren't diagnosed as storage class specifiers.
+#if __STDC_VERSION__ >= 202311L
+void quibble(constexpr void);    // expected-error {{function parameter cannot be constexpr}}
+#endif
+void quabble(_Thread_local void); // expected-error {{'_Thread_local' is only allowed on variable declarations}}
+void bing(void, ...);            // expected-error {{'void' must be the first and only parameter if specified}}
+
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 8b677095cee182..e66424290e6d50 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -201,7 +201,7 @@ <h2 id="c2y">C2y implementation status</h2>
     <tr>
       <td>Slay Some Earthly Demons VI</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3344.pdf">N3344</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="unreleased" align="center">Clang 20</td>
     </tr>
     <tr>
       <td>Slay Some Earthly Demons VII</td>

>From 33ec9823cd4869ff5817b0ee83cd8a78f06e62e4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 7 Nov 2024 07:35:33 -0800
Subject: [PATCH 13/20] ValueTracking: Do not return nullptr from
 getUnderlyingObject (#115258)

Fixup for 29a5c054e6d56a912ed5ba3f84e8ca631872db8b. The failure case
should return the last value found rather than nullptr, which callers do
not expect; the new test covers the crash this caused.
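
Callers dereference the result unconditionally, so the contract is that the
walk always yields some value. A hedged sketch of the caller-side assumption
(illustrative, not code from this patch):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Illustrative helper in the style of getUnderlyingObject's users: safe
    // only because getUnderlyingObject never returns nullptr.
    static bool baseIsAlloca(const Value *Ptr) {
      const Value *UO = getUnderlyingObject(Ptr);
      return isa<AllocaInst>(UO);
    }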
---
 llvm/lib/Analysis/ValueTracking.cpp                |  7 ++++---
 ...vector-of-pointers-getunderlyingobject-crash.ll | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 52d3468fbe9cae..f178e3a8acc259 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6723,9 +6723,10 @@ static bool isSameUnderlyingObjectInLoop(const PHINode *PN,
 const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) {
   for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
     if (auto *GEP = dyn_cast<GEPOperator>(V)) {
-      V = GEP->getPointerOperand();
-      if (!V->getType()->isPointerTy()) // Only handle scalar pointer base.
-        return nullptr;
+      const Value *PtrOp = GEP->getPointerOperand();
+      if (!PtrOp->getType()->isPointerTy()) // Only handle scalar pointer base.
+        return V;
+      V = PtrOp;
     } else if (Operator::getOpcode(V) == Instruction::BitCast ||
                Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       Value *NewV = cast<Operator>(V)->getOperand(0);
diff --git a/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll b/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll
new file mode 100644
index 00000000000000..2ccb1efd85e33b
--- /dev/null
+++ b/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=function-attrs < %s | FileCheck %s
+
+define double @getUnderlyingObject_vector_ptr(<4 x i1> %arg0, <4 x i1> %arg1) {
+; CHECK-LABEL: define double @getUnderlyingObject_vector_ptr(
+; CHECK-SAME: <4 x i1> [[ARG0:%.*]], <4 x i1> [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> getelementptr inbounds (i8, <4 x ptr> zeroinitializer, <4 x i64> splat (i64 8)), i32 0, <4 x i1> [[ARG0]], <4 x double> zeroinitializer)
+; CHECK-NEXT:    [[REDUCE_FADD:%.*]] = tail call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[GATHER]])
+; CHECK-NEXT:    ret double [[REDUCE_FADD]]
+;
+  %gather = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> getelementptr inbounds (i8, <4 x ptr> zeroinitializer, <4 x i64> <i64 8, i64 8, i64 8, i64 8>), i32 0, <4 x i1> %arg0, <4 x double> zeroinitializer)
+  %reduce.fadd = tail call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> %gather)
+  ret double %reduce.fadd
+}

>From 1275a11585cdd43941a8a463687a7cf43348f76c Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Thu, 7 Nov 2024 10:41:51 -0500
Subject: [PATCH 14/20] Fix failing test bot

This addresses the issue found by:
https://lab.llvm.org/buildbot/#/builders/144/builds/11070
---
 clang/test/C/C2y/n3344.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/test/C/C2y/n3344.c b/clang/test/C/C2y/n3344.c
index bd3d440cb5d12a..b9c291d9f1dfa1 100644
--- a/clang/test/C/C2y/n3344.c
+++ b/clang/test/C/C2y/n3344.c
@@ -23,6 +23,8 @@ void quubble(extern void);       // expected-error {{invalid storage class speci
 #if __STDC_VERSION__ >= 202311L
 void quibble(constexpr void);    // expected-error {{function parameter cannot be constexpr}}
 #endif
+#if __STDC_VERSION__ >= 201112L
 void quabble(_Thread_local void); // expected-error {{'_Thread_local' is only allowed on variable declarations}}
+#endif
 void bing(void, ...);            // expected-error {{'void' must be the first and only parameter if specified}}
 

>From 55054b37711f54754ccbca132ce8c2c1c6e1e704 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Thu, 7 Nov 2024 16:46:56 +0100
Subject: [PATCH 15/20] [OpenMP] Add missing SExt attributes on i32 args.
 (#115242)

Add the missing SExt parameter attributes to the i32 arguments of
__kmpc_omp_taskwait_deps_51 so they are sign-extended on targets whose
ABI requires it.
---
 llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 3 ++-
 llvm/test/Transforms/OpenMP/add_attributes.ll  | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d8f3c8fa06b747..6f26f853eca032 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -946,7 +946,8 @@ __OMP_RTL_ATTRS(__kmpc_proxy_task_completed_ooo, DefaultAttrs, AttributeSet(),
 __OMP_RTL_ATTRS(__kmpc_omp_wait_deps, BarrierAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt, ReadOnlyPtrAttrs, SExt))
 __OMP_RTL_ATTRS(__kmpc_omp_taskwait_deps_51, BarrierAttrs, AttributeSet(),
-                ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt, ReadOnlyPtrAttrs))
+                ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt, ReadOnlyPtrAttrs, SExt,
+                           AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__kmpc_cancellationpoint, DefaultAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
 
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index fefb1900afae7e..ce28643267a6a5 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -589,6 +589,8 @@ declare i32 @__kmpc_omp_task_with_deps(ptr, i32, ptr, i32, ptr, i32, ptr)
 
 declare void @__kmpc_omp_wait_deps(ptr, i32, i32, ptr, i32, ptr)
 
+declare void @__kmpc_omp_taskwait_deps_51(ptr, i32, i32, ptr, i32, ptr, i32)
+
 declare i32 @__kmpc_cancellationpoint(ptr, i32, i32)
 
 declare void @__kmpc_push_num_teams(ptr, i32, i32, i32)
@@ -2465,6 +2467,9 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; EXT: ; Function Attrs: convergent nounwind
 ; EXT-NEXT: declare void @__kmpc_omp_wait_deps(ptr, i32 signext, i32 signext, ptr, i32 signext, ptr)
 
+; EXT: ; Function Attrs: convergent nounwind
+; EXT-NEXT: declare void @__kmpc_omp_taskwait_deps_51(ptr, i32 signext, i32 signext, ptr, i32 signext, ptr, i32 signext)
+
 ; EXT: ; Function Attrs: nounwind
 ; EXT-NEXT: declare signext i32 @__kmpc_cancellationpoint(ptr, i32 signext, i32 signext)
 

>From 5230fd17a2e1176dee7a9618bdc870b9f037fbc0 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian at ca.ibm.com>
Date: Thu, 7 Nov 2024 10:53:11 -0500
Subject: [PATCH 16/20] [llvm-objdump] Implement decoding the auxiliary header
 for XCOFF with llvm-objdump --private-headers (#105682)

Implement decoding of the auxiliary header of an XCOFF object file with
llvm-objdump --private-headers.
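
Each field is printed only when it lies entirely within the optional header
size recorded in the file header, so truncated auxiliary headers produce a
warning instead of an out-of-bounds read. A hedged sketch of that bounds
check in plain C (names are illustrative, not the patch's):

    #include <stddef.h>
    #include <stdio.h>

    struct aux_hdr { unsigned short magic, version; unsigned text_size; };

    /* Print a member only if it lies fully inside the header; remember the
     * first member that starts in bounds but is cut short. */
    static void print_member(const struct aux_hdr *h, const void *member,
                             size_t member_size, size_t aux_size,
                             const char *name, size_t *partial_off,
                             const char **partial_name) {
      size_t off = (size_t)((const char *)member - (const char *)h);
      if (off + member_size <= aux_size)
        printf("%s fits and would be printed\n", name);
      else if (off < aux_size && !*partial_name) {
        *partial_off = off;   /* warned about after all members are visited */
        *partial_name = name;
      }
    }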
---
 .../XCOFF/private-header-auxiliary.test       | 103 +++++++++
 llvm/tools/llvm-objdump/XCOFFDump.cpp         | 204 +++++++++++++++++-
 2 files changed, 303 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/tools/llvm-objdump/XCOFF/private-header-auxiliary.test

diff --git a/llvm/test/tools/llvm-objdump/XCOFF/private-header-auxiliary.test b/llvm/test/tools/llvm-objdump/XCOFF/private-header-auxiliary.test
new file mode 100644
index 00000000000000..a2aca3710157f5
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/XCOFF/private-header-auxiliary.test
@@ -0,0 +1,103 @@
+## Test that `llvm-objdump --private-headers` prints out the auxiliary header of an XCOFF object file.
+# RUN: yaml2obj %s -o %t1
+# RUN: llvm-objdump --private-headers %t1 | \
+# RUN:   FileCheck %s --check-prefixes=CHECK32,COMMON --match-full-lines --strict-whitespace
+# RUN: yaml2obj %s -DMAGIC=0x1F7 -DFLAG64=0x2 -o %t2
+# RUN: llvm-objdump --private-headers %t2 | \
+# RUN:   FileCheck %s --check-prefixes=CHECK64,COMMON --match-full-lines --strict-whitespace
+
+--- !XCOFF
+FileHeader:
+  MagicNumber: [[MAGIC=0x1DF]]
+AuxiliaryHeader:
+  Magic:                 0x10B
+  Version:               0x1
+  TextSectionSize:       0x8
+  DataSectionSize:       0x9
+  BssSectionSize:        0x10
+  EntryPointAddr:        0x1111
+  TextStartAddr:         0x2222
+  DataStartAddr:         0x3333
+  TOCAnchorAddr:         0x4444
+  SecNumOfEntryPoint:    1
+  SecNumOfText:          2
+  SecNumOfData:          3
+  SecNumOfTOC:           4
+  SecNumOfLoader:        5
+  SecNumOfBSS:           6
+  MaxAlignOfText:        0x7
+  MaxAlignOfData:        0x3
+  ModuleType:            0x1
+  TextPageSize:          0x1
+  DataPageSize:          0x1
+  StackPageSize:         0x1
+  SecNumOfTData:         7
+  SecNumOfTBSS:          8
+  FlagAndTDataAlignment: 0x1
+  Flag:                  [[FLAG64=<none>]]
+Sections:
+  - Flags:       [ STYP_TEXT ]
+    SectionData: "1232"
+  - Flags:       [ STYP_DATA ]
+    SectionData: "5678"
+  - Flags:       [ STYP_BSS ]
+    SectionData: "9101"
+  - Flags:       [ STYP_TDATA ]
+    SectionData: "1112"
+  - Flags:       [ STYP_TBSS ]
+    SectionData: "1314"
+
+#       COMMON:---Auxiliary Header:
+#  COMMON-NEXT:Magic:                              0x10b
+#  COMMON-NEXT:Version:                            0x1
+# CHECK32-NEXT:Size of .text section:              0x8
+# CHECK32-NEXT:Size of .data section:              0x9
+# CHECK32-NEXT:Size of .bss section:               0x10
+# CHECK32-NEXT:Entry point address:                0x1111
+# CHECK64-NEXT:Reserved for debugger:              0x0
+#  COMMON-NEXT:.text section start address:        0x2222
+#  COMMON-NEXT:.data section start address:        0x3333
+#  COMMON-NEXT:TOC anchor address:                 0x4444
+#  COMMON-NEXT:Section number of entryPoint:       1
+#  COMMON-NEXT:Section number of .text:            2
+#  COMMON-NEXT:Section number of .data:            3
+#  COMMON-NEXT:Section number of TOC:              4
+#  COMMON-NEXT:Section number of loader data:      5
+#  COMMON-NEXT:Section number of .bss:             6
+#  COMMON-NEXT:Maximum alignment of .text:         0x7
+#  COMMON-NEXT:Maximum alignment of .data:         0x3
+#  COMMON-NEXT:Module type:                        0x0
+#  COMMON-NEXT:CPU type of objects:                0x1
+# CHECK32-NEXT:Maximum stack size:                 0x0
+# CHECK32-NEXT:Maximum data size:                  0x0
+# CHECK32-NEXT:Reserved for debugger:              0x0
+#  COMMON-NEXT:Text page size:                     0x1
+#  COMMON-NEXT:Data page size:                     0x1
+#  COMMON-NEXT:Stack page size:                    0x1
+#  COMMON-NEXT:Flag:                               0x0
+#  COMMON-NEXT:Alignment of thread-local storage:  0x1
+# CHECK64-NEXT:Size of .text section:              0x8
+# CHECK64-NEXT:Size of .data section:              0x9
+# CHECK64-NEXT:Size of .bss section:               0x10
+# CHECK64-NEXT:Entry point address:                0x1111
+# CHECK64-NEXT:Maximum stack size:                 0x0
+# CHECK64-NEXT:Maximum data size:                  0x0
+#  COMMON-NEXT:Section number for .tdata:          7
+#  COMMON-NEXT:Section number for .tbss:           8
+# CHECK64-NEXT:Additional flags 64-bit XCOFF:      0x2
+
+## Test how llvm-objdump behaves when the auxiliary header of the XCOFF object file contains a partial field.
+# RUN: cp %t1 %t1_err1
+# RUN: %python -c "with open(r'%t1_err1', 'r+b') as input: input.seek(17); input.write(b'\x45'); input.seek(4); input.write(b'\x00')"
+# RUN: llvm-objdump --private-headers %t1_err1 2>&1 | FileCheck %s --check-prefix=WARN1 --match-full-lines --strict-whitespace
+
+#      WARN1:{{.*}}: only partial field for Section number for .tdata: at offset (68)
+# WARN1-NEXT:Raw data (00)
+
+## Test how llvm-objdump behaves when the auxiliary header of the XCOFF object file contains extra data.
+# RUN: cp %t1 %t1_extra
+# RUN: %python -c "with open(r'%t1_extra', 'r+b') as input: input.seek(17); input.write(b'\x4f'); input.seek(4); input.write(b'\x00')"
+# RUN: llvm-objdump --private-headers %t1_extra 2>&1 | FileCheck %s --check-prefix=EXTRA --match-full-lines --strict-whitespace
+
+#     EXTRA:Extra raw data (00000000 000000)
+# EXTRA-NOT:{{.}}
diff --git a/llvm/tools/llvm-objdump/XCOFFDump.cpp b/llvm/tools/llvm-objdump/XCOFFDump.cpp
index 524bf07c372a9b..2e8a8c82b89079 100644
--- a/llvm/tools/llvm-objdump/XCOFFDump.cpp
+++ b/llvm/tools/llvm-objdump/XCOFFDump.cpp
@@ -32,6 +32,7 @@ using namespace llvm::support;
 
 namespace {
 class XCOFFDumper : public objdump::Dumper {
+  enum PrintStyle { Hex, Number };
   const XCOFFObjectFile &Obj;
   unsigned Width;
 
@@ -41,14 +42,32 @@ class XCOFFDumper : public objdump::Dumper {
 private:
   void printPrivateHeaders() override;
   void printFileHeader();
-  FormattedString formatName(StringRef Name);
+  void printAuxiliaryHeader();
+  void printAuxiliaryHeader(const XCOFFAuxiliaryHeader32 *AuxHeader);
+  void printAuxiliaryHeader(const XCOFFAuxiliaryHeader64 *AuxHeader);
+  template <typename AuxHeaderMemberType, typename XCOFFAuxiliaryHeader>
+  void printAuxMemberHelper(PrintStyle Style, const char *MemberName,
+                            const AuxHeaderMemberType &Member,
+                            const XCOFFAuxiliaryHeader *AuxHeader,
+                            uint16_t AuxSize, uint16_t &PartialFieldOffset,
+                            const char *&PartialFieldName);
+  template <typename XCOFFAuxiliaryHeader>
+  void checkAndPrintAuxHeaderParseError(const char *PartialFieldName,
+                                        uint16_t PartialFieldOffset,
+                                        uint16_t AuxSize,
+                                        XCOFFAuxiliaryHeader &AuxHeader);
+
+  void printBinary(StringRef Name, ArrayRef<uint8_t> Data);
   void printHex(StringRef Name, uint64_t Value);
   void printNumber(StringRef Name, uint64_t Value);
+  FormattedString formatName(StringRef Name);
   void printStrHex(StringRef Name, StringRef Str, uint64_t Value);
-  void setWidth(unsigned W) { Width = W; };
 };
 
-void XCOFFDumper::printPrivateHeaders() { printFileHeader(); }
+void XCOFFDumper::printPrivateHeaders() {
+  printFileHeader();
+  printAuxiliaryHeader();
+}
 
 FormattedString XCOFFDumper::formatName(StringRef Name) {
   return FormattedString(Name, Width, FormattedString::JustifyLeft);
@@ -67,8 +86,185 @@ void XCOFFDumper::printStrHex(StringRef Name, StringRef Str, uint64_t Value) {
          << ")\n";
 }
 
+void XCOFFDumper::printBinary(StringRef Name, ArrayRef<uint8_t> Data) {
+  unsigned OrgWidth = Width;
+  Width = 0;
+  outs() << formatName(Name) << " (" << format_bytes(Data) << ")\n";
+  Width = OrgWidth;
+}
+
+void XCOFFDumper::printAuxiliaryHeader() {
+  Width = 36;
+  if (Obj.is64Bit())
+    printAuxiliaryHeader(Obj.auxiliaryHeader64());
+  else
+    printAuxiliaryHeader(Obj.auxiliaryHeader32());
+}
+
+template <typename AuxHeaderMemberType, typename XCOFFAuxiliaryHeader>
+void XCOFFDumper::printAuxMemberHelper(PrintStyle Style, const char *MemberName,
+                                       const AuxHeaderMemberType &Member,
+                                       const XCOFFAuxiliaryHeader *AuxHeader,
+                                       uint16_t AuxSize,
+                                       uint16_t &PartialFieldOffset,
+                                       const char *&PartialFieldName) {
+  ptrdiff_t Offset = reinterpret_cast<const char *>(&Member) -
+                     reinterpret_cast<const char *>(AuxHeader);
+  if (Offset + sizeof(Member) <= AuxSize) {
+    if (Style == Hex)
+      printHex(MemberName, Member);
+    else
+      printNumber(MemberName, Member);
+  } else if (Offset < AuxSize) {
+    PartialFieldOffset = Offset;
+    PartialFieldName = MemberName;
+  }
+}
+
+template <typename XCOFFAuxiliaryHeader>
+void XCOFFDumper::checkAndPrintAuxHeaderParseError(
+    const char *PartialFieldName, uint16_t PartialFieldOffset, uint16_t AuxSize,
+    XCOFFAuxiliaryHeader &AuxHeader) {
+  if (PartialFieldOffset < AuxSize) {
+    std::string Buf;
+    raw_string_ostream OS(Buf);
+    OS << FormattedString("Raw data", 0, FormattedString::JustifyLeft) << " ("
+       << format_bytes(
+              ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(&AuxHeader) +
+                                    PartialFieldOffset,
+                                AuxSize - PartialFieldOffset))
+       << ")\n";
+    reportUniqueWarning(Twine("only partial field for ") + PartialFieldName +
+                        " at offset (" + Twine(PartialFieldOffset) + ")\n" +
+                        OS.str());
+  } else if (sizeof(AuxHeader) < AuxSize) {
+    printBinary(
+        "Extra raw data",
+        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(&AuxHeader) +
+                              sizeof(AuxHeader),
+                          AuxSize - sizeof(AuxHeader)));
+  }
+}
+
+void XCOFFDumper::printAuxiliaryHeader(
+    const XCOFFAuxiliaryHeader32 *AuxHeader) {
+  if (AuxHeader == nullptr)
+    return;
+  outs() << "\n---Auxiliary Header:\n";
+  uint16_t AuxSize = Obj.getOptionalHeaderSize();
+  uint16_t PartialFieldOffset = AuxSize;
+  const char *PartialFieldName = nullptr;
+
+  auto PrintAuxMember = [&](PrintStyle Style, const char *MemberName,
+                            auto &Member) {
+    printAuxMemberHelper(Style, MemberName, Member, AuxHeader, AuxSize,
+                         PartialFieldOffset, PartialFieldName);
+  };
+
+  PrintAuxMember(Hex, "Magic:", AuxHeader->AuxMagic);
+  PrintAuxMember(Hex, "Version:", AuxHeader->Version);
+  PrintAuxMember(Hex, "Size of .text section:", AuxHeader->TextSize);
+  PrintAuxMember(Hex, "Size of .data section:", AuxHeader->InitDataSize);
+  PrintAuxMember(Hex, "Size of .bss section:", AuxHeader->BssDataSize);
+  PrintAuxMember(Hex, "Entry point address:", AuxHeader->EntryPointAddr);
+  PrintAuxMember(Hex, ".text section start address:", AuxHeader->TextStartAddr);
+  PrintAuxMember(Hex, ".data section start address:", AuxHeader->DataStartAddr);
+  PrintAuxMember(Hex, "TOC anchor address:", AuxHeader->TOCAnchorAddr);
+  PrintAuxMember(
+      Number, "Section number of entryPoint:", AuxHeader->SecNumOfEntryPoint);
+  PrintAuxMember(Number, "Section number of .text:", AuxHeader->SecNumOfText);
+  PrintAuxMember(Number, "Section number of .data:", AuxHeader->SecNumOfData);
+  PrintAuxMember(Number, "Section number of TOC:", AuxHeader->SecNumOfTOC);
+  PrintAuxMember(Number,
+                 "Section number of loader data:", AuxHeader->SecNumOfLoader);
+  PrintAuxMember(Number, "Section number of .bss:", AuxHeader->SecNumOfBSS);
+  PrintAuxMember(Hex, "Maxium alignment of .text:", AuxHeader->MaxAlignOfText);
+  PrintAuxMember(Hex, "Maxium alignment of .data:", AuxHeader->MaxAlignOfData);
+  PrintAuxMember(Hex, "Module type:", AuxHeader->ModuleType);
+  PrintAuxMember(Hex, "CPU type of objects:", AuxHeader->CpuFlag);
+  PrintAuxMember(Hex, "Maximum stack size:", AuxHeader->MaxStackSize);
+  PrintAuxMember(Hex, "Maximum data size:", AuxHeader->MaxDataSize);
+  PrintAuxMember(Hex, "Reserved for debugger:", AuxHeader->ReservedForDebugger);
+  PrintAuxMember(Hex, "Text page size:", AuxHeader->TextPageSize);
+  PrintAuxMember(Hex, "Data page size:", AuxHeader->DataPageSize);
+  PrintAuxMember(Hex, "Stack page size:", AuxHeader->StackPageSize);
+  if (offsetof(XCOFFAuxiliaryHeader32, FlagAndTDataAlignment) +
+          sizeof(XCOFFAuxiliaryHeader32::FlagAndTDataAlignment) <=
+      AuxSize) {
+    printHex("Flag:", AuxHeader->getFlag());
+    printHex("Alignment of thread-local storage:",
+             AuxHeader->getTDataAlignment());
+  }
+
+  PrintAuxMember(Number,
+                 "Section number for .tdata:", AuxHeader->SecNumOfTData);
+  PrintAuxMember(Number, "Section number for .tbss:", AuxHeader->SecNumOfTBSS);
+
+  checkAndPrintAuxHeaderParseError(PartialFieldName, PartialFieldOffset,
+                                   AuxSize, *AuxHeader);
+}
+
+void XCOFFDumper::printAuxiliaryHeader(
+    const XCOFFAuxiliaryHeader64 *AuxHeader) {
+  if (AuxHeader == nullptr)
+    return;
+  uint16_t AuxSize = Obj.getOptionalHeaderSize();
+  outs() << "\n---Auxiliary Header:\n";
+  uint16_t PartialFieldOffset = AuxSize;
+  const char *PartialFieldName = nullptr;
+
+  auto PrintAuxMember = [&](PrintStyle Style, const char *MemberName,
+                            auto &Member) {
+    printAuxMemberHelper(Style, MemberName, Member, AuxHeader, AuxSize,
+                         PartialFieldOffset, PartialFieldName);
+  };
+
+  PrintAuxMember(Hex, "Magic:", AuxHeader->AuxMagic);
+  PrintAuxMember(Hex, "Version:", AuxHeader->Version);
+  PrintAuxMember(Hex, "Reserved for debugger:", AuxHeader->ReservedForDebugger);
+  PrintAuxMember(Hex, ".text section start address:", AuxHeader->TextStartAddr);
+  PrintAuxMember(Hex, ".data section start address:", AuxHeader->DataStartAddr);
+  PrintAuxMember(Hex, "TOC anchor address:", AuxHeader->TOCAnchorAddr);
+  PrintAuxMember(
+      Number, "Section number of entryPoint:", AuxHeader->SecNumOfEntryPoint);
+  PrintAuxMember(Number, "Section number of .text:", AuxHeader->SecNumOfText);
+  PrintAuxMember(Number, "Section number of .data:", AuxHeader->SecNumOfData);
+  PrintAuxMember(Number, "Section number of TOC:", AuxHeader->SecNumOfTOC);
+  PrintAuxMember(Number,
+                 "Section number of loader data:", AuxHeader->SecNumOfLoader);
+  PrintAuxMember(Number, "Section number of .bss:", AuxHeader->SecNumOfBSS);
+  PrintAuxMember(Hex, "Maxium alignment of .text:", AuxHeader->MaxAlignOfText);
+  PrintAuxMember(Hex, "Maxium alignment of .data:", AuxHeader->MaxAlignOfData);
+  PrintAuxMember(Hex, "Module type:", AuxHeader->ModuleType);
+  PrintAuxMember(Hex, "CPU type of objects:", AuxHeader->CpuFlag);
+  PrintAuxMember(Hex, "Text page size:", AuxHeader->TextPageSize);
+  PrintAuxMember(Hex, "Data page size:", AuxHeader->DataPageSize);
+  PrintAuxMember(Hex, "Stack page size:", AuxHeader->StackPageSize);
+  if (offsetof(XCOFFAuxiliaryHeader64, FlagAndTDataAlignment) +
+          sizeof(XCOFFAuxiliaryHeader64::FlagAndTDataAlignment) <=
+      AuxSize) {
+    printHex("Flag:", AuxHeader->getFlag());
+    printHex("Alignment of thread-local storage:",
+             AuxHeader->getTDataAlignment());
+  }
+  PrintAuxMember(Hex, "Size of .text section:", AuxHeader->TextSize);
+  PrintAuxMember(Hex, "Size of .data section:", AuxHeader->InitDataSize);
+  PrintAuxMember(Hex, "Size of .bss section:", AuxHeader->BssDataSize);
+  PrintAuxMember(Hex, "Entry point address:", AuxHeader->EntryPointAddr);
+  PrintAuxMember(Hex, "Maximum stack size:", AuxHeader->MaxStackSize);
+  PrintAuxMember(Hex, "Maximum data size:", AuxHeader->MaxDataSize);
+  PrintAuxMember(Number,
+                 "Section number for .tdata:", AuxHeader->SecNumOfTData);
+  PrintAuxMember(Number, "Section number for .tbss:", AuxHeader->SecNumOfTBSS);
+  PrintAuxMember(Hex, "Additional flags 64-bit XCOFF:", AuxHeader->XCOFF64Flag);
+
+  checkAndPrintAuxHeaderParseError(PartialFieldName, PartialFieldOffset,
+                                   AuxSize, *AuxHeader);
+}
+
 void XCOFFDumper::printFileHeader() {
-  setWidth(20);
+  Width = 20;
   outs() << "\n---File Header:\n";
   printHex("Magic:", Obj.getMagic());
   printNumber("NumberOfSections:", Obj.getNumberOfSections());

>From 5b92aabf703475d413b852fcd1abb5b05a50f36c Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Thu, 7 Nov 2024 10:59:16 -0500
Subject: [PATCH 17/20] Add release note for WG14 N3298
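
For reference, the suffixes the note describes, long accepted by Clang and
GCC as a GNU extension and now standard in C2y (a small hedged example):

    double _Complex z = 3.0 + 4.0i;  /* 'i' suffix: imaginary constant */
    double _Complex w = 4.0j;        /* 'j' is an equivalent spelling */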

---
 clang/docs/ReleaseNotes.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3e7d8e15110f9d..703422c9a6f0b3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -274,6 +274,13 @@ C Language Changes
 C2y Feature Support
 ^^^^^^^^^^^^^^^^^^^
 
+- Updated conformance for `N3298 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3298.htm>`_
+  which adds the ``i`` and ``j`` suffixes for the creation of a ``_Complex``
+  constant value. Clang has always supported these suffixes as a GNU extension,
+  so ``-Wgnu-imaginary-constant`` no longer has an effect in C modes, as the
+  suffixes are now standard C2y rather than an extension.
+  ``-Wgnu-imaginary-constant`` still applies in C++ modes.
+
 - Clang updated conformance for `N3370 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3370.htm>`_
   case range expressions. This feature was previously supported by Clang as a
   GNU extension, so ``-Wgnu-case-range`` no longer has effect in C modes, as

>From cddd07e675814989d896e94c356727c39a67b6ff Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 7 Nov 2024 15:26:37 +0000
Subject: [PATCH 18/20] [clang][x86] Add constexpr support for
 _mm256_set_pd/_mm256_set_ps/_mm256_set1_pd/_mm256_set1_ps/_mm256_setr_pd/_mm256_setr_ps
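
With these functions marked constexpr, C++ callers can build the vectors in
constant expressions; a minimal hedged sketch mirroring the TEST_CONSTEXPR
checks added below (requires a C++ build with AVX enabled):

    #include <immintrin.h>
    // Evaluated entirely at compile time (illustrative usage, not a test).
    constexpr __m256d quad  = _mm256_set1_pd(42.0);
    constexpr __m256  octet = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);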

---
 clang/lib/Headers/avxintrin.h         | 12 ++++++------
 clang/test/CodeGen/X86/avx-builtins.c |  6 ++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index bb43b292be01f6..8e497a98234994 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -3706,7 +3706,7 @@ _mm256_undefined_si256(void)
 ///    A double-precision floating-point value used to initialize bits [63:0]
 ///    of the result.
 /// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_pd(double __a, double __b, double __c, double __d)
 {
   return __extension__ (__m256d){ __d, __c, __b, __a };
@@ -3745,7 +3745,7 @@ _mm256_set_pd(double __a, double __b, double __c, double __d)
 ///    A single-precision floating-point value used to initialize bits [31:0]
 ///    of the result.
 /// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
 {
@@ -3972,7 +3972,7 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
 ///    A double-precision floating-point value used to initialize bits [255:192]
 ///    of the result.
 /// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_pd(double __a, double __b, double __c, double __d)
 {
   return _mm256_set_pd(__d, __c, __b, __a);
@@ -4012,7 +4012,7 @@ _mm256_setr_pd(double __a, double __b, double __c, double __d)
 ///    A single-precision floating-point value used to initialize bits [255:224]
 ///    of the result.
 /// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_setr_ps(float __a, float __b, float __c, float __d,
                float __e, float __f, float __g, float __h)
 {
@@ -4229,7 +4229,7 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
 ///    A double-precision floating-point value used to initialize each vector
 ///    element of the result.
 /// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_pd(double __w)
 {
   return _mm256_set_pd(__w, __w, __w, __w);
@@ -4248,7 +4248,7 @@ _mm256_set1_pd(double __w)
 ///    A single-precision floating-point value used to initialize each vector
 ///    element of the result.
 /// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_set1_ps(float __w)
 {
   return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 9ed6d47e8808a9..e390d861832345 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1496,6 +1496,7 @@ __m256d test_mm256_set_pd(double A0, double A1, double A2, double A3) {
   // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_set_pd(A0, A1, A2, A3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_set_pd(-100.0, +90.0, -50.0, +1.0), +1.0, -50.0, +90.0, -100.0));
 
 __m256 test_mm256_set_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
   // CHECK-LABEL: test_mm256_set_ps
@@ -1509,6 +1510,7 @@ __m256 test_mm256_set_ps(float A0, float A1, float A2, float A3, float A4, float
   // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_set_ps(A0, A1, A2, A3, A4, A5, A6, A7);
 }
+TEST_CONSTEXPR(match_m256(_mm256_set_ps(-1.0f, +2.0f, -3.0f, +4.0f, -5.0f, +6.0f, -7.0f, +8.0f), +8.0f, -7.0f, +6.0f, -5.0f, +4.0f, -3.0f, +2.0f, -1.0f));
 
 __m256i test_mm256_set1_epi8(char A) {
   // CHECK-LABEL: test_mm256_set1_epi8
@@ -1598,6 +1600,7 @@ __m256d test_mm256_set1_pd(double A) {
   // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_set1_pd(A);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_set1_pd(+42.0), +42.0, +42.0, +42.0, +42.0));
 
 __m256 test_mm256_set1_ps(float A) {
   // CHECK-LABEL: test_mm256_set1_ps
@@ -1611,6 +1614,7 @@ __m256 test_mm256_set1_ps(float A) {
   // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_set1_ps(A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_set1_ps(-101.0f), -101.0f, -101.0f, -101.0f, -101.0f, -101.0f, -101.0f, -101.0f, -101.0f));
 
 __m256i test_mm256_setr_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
                              char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
@@ -1722,6 +1726,7 @@ __m256d test_mm256_setr_pd(double A0, double A1, double A2, double A3) {
   // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_setr_pd(A0, A1, A2, A3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_setr_pd(-100.0, +90.0, -50.0, +1.0), -100.0, +90.0, -50.0, +1.0));
 
 __m256 test_mm256_setr_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
   // CHECK-LABEL: test_mm256_setr_ps
@@ -1735,6 +1740,7 @@ __m256 test_mm256_setr_ps(float A0, float A1, float A2, float A3, float A4, floa
   // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_setr_ps(A0, A1, A2, A3, A4, A5, A6, A7);
 }
+TEST_CONSTEXPR(match_m256(_mm256_setr_ps(-1.0f, +2.0f, -3.0f, +4.0f, -5.0f, +6.0f, -7.0f, +8.0f), -1.0f, +2.0f, -3.0f, +4.0f, -5.0f, +6.0f, -7.0f, +8.0f));
 
 __m256d test_mm256_setzero_pd(void) {
   // CHECK-LABEL: test_mm256_setzero_pd

>From 6531fd98645a21473205792bfa16c95313174dd8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 7 Nov 2024 15:29:20 +0000
Subject: [PATCH 19/20] [clang][x86] avx-builtins.c - reorder tests to keep
 alpha sorting order. NFC.

---
 clang/test/CodeGen/X86/avx-builtins.c | 39 +++++++++++++--------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index e390d861832345..cf7c61d871502b 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -938,6 +938,24 @@ __m256d test_mm256_cvtps_pd(__m128 A) {
   return _mm256_cvtps_pd(A);
 }
 
+double test_mm256_cvtsd_f64(__m256d __a) {
+  // CHECK-LABEL: test_mm256_cvtsd_f64
+  // CHECK: extractelement <4 x double> %{{.*}}, i32 0
+  return _mm256_cvtsd_f64(__a);
+}
+
+int test_mm256_cvtsi256_si32(__m256i __a) {
+  // CHECK-LABEL: test_mm256_cvtsi256_si32
+  // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
+  return _mm256_cvtsi256_si32(__a);
+}
+
+float test_mm256_cvtss_f32(__m256 __a) {
+  // CHECK-LABEL: test_mm256_cvtss_f32
+  // CHECK: extractelement <8 x float> %{{.*}}, i32 0
+  return _mm256_cvtss_f32(__a);
+}
+
 __m128i test_mm256_cvttpd_epi32(__m256d A) {
   // CHECK-LABEL: test_mm256_cvttpd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %{{.*}})
@@ -2086,24 +2104,3 @@ __m256i test_mm256_zextsi128_si256(__m128i A) {
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_zextsi128_si256(A);
 }
-
-double test_mm256_cvtsd_f64(__m256d __a)
-{
-  // CHECK-LABEL: test_mm256_cvtsd_f64
-  // CHECK: extractelement <4 x double> %{{.*}}, i32 0
-  return _mm256_cvtsd_f64(__a);
-}
-
-int test_mm256_cvtsi256_si32(__m256i __a)
-{
-  // CHECK-LABEL: test_mm256_cvtsi256_si32
-  // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
-  return _mm256_cvtsi256_si32(__a);
-}
-
-float test_mm256_cvtss_f32(__m256 __a)
-{
-  // CHECK-LABEL: test_mm256_cvtss_f32
-  // CHECK: extractelement <8 x float> %{{.*}}, i32 0
-  return _mm256_cvtss_f32(__a);
-}

>From f227cd7076f27917da400f307635ee2376267b95 Mon Sep 17 00:00:00 2001
From: Kito Cheng <kito.cheng at sifive.com>
Date: Fri, 8 Nov 2024 00:06:48 +0800
Subject: [PATCH 20/20] [compiler-rt][RISCV] Avoid using
 __init_riscv_feature_bits as a direct constructor (#115316)

`__init_riscv_feature_bits` takes an argument that can be
platform-specific, potentially pointing to the VDSO address of the
hwprobe system call for Linux. However, marking it as a constructor does
not guarantee that 0/NULL will always be passed to this argument, which
may result in treating an uninitialized or garbage value as a pointer to
hwprobe, leading to a crash.

The simplest solution is to introduce a small constructor function to
ensure that the platform-specific argument is set to 0/NULL.
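
A hedged sketch of the resulting pattern (the real code spells the attribute
via compiler-rt's CONSTRUCTOR_ATTRIBUTE macro):

    void __init_riscv_feature_bits(void *platform_args);

    /* Zero-argument trampoline: ELF constructors are invoked with
     * unspecified argument registers, so pin the argument to NULL here. */
    __attribute__((constructor))
    static void init_riscv_feature_bits_ctor(void) {
      __init_riscv_feature_bits(0);
    }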
---
 compiler-rt/lib/builtins/cpu_model/riscv.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
index 052124fdde447e..74534896057ef5 100644
--- a/compiler-rt/lib/builtins/cpu_model/riscv.c
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -335,7 +335,8 @@ static void initRISCVFeature(struct riscv_hwprobe Hwprobes[]) {
 
 static int FeaturesBitCached = 0;
 
-void __init_riscv_feature_bits(void *) CONSTRUCTOR_ATTRIBUTE;
+void __init_riscv_feature_bits(void *);
+static void __init_riscv_feature_bits_ctor(void) CONSTRUCTOR_ATTRIBUTE;
 
 // A constructor function that sets __riscv_feature_bits, and
 // __riscv_vendor_feature_bits to the right values.  This needs to run
@@ -343,10 +344,14 @@ void __init_riscv_feature_bits(void *) CONSTRUCTOR_ATTRIBUTE;
 // run before constructors without the priority set.  However, it still runs
 // after ifunc initializers and needs to be called explicitly there.
 
+static void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits_ctor(void) {
+  __init_riscv_feature_bits(0);
+}
+
 // PlatformArgs allows the platform to provide pre-computed data and access it
 // without extra effort. For example, Linux could pass the vDSO object to avoid
 // an extra system call.
-void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits(void *PlatformArgs) {
+void __init_riscv_feature_bits(void *PlatformArgs) {
 
   if (FeaturesBitCached)
     return;


