[clang] [Clang][RISCV] Handle RVV tuple types correctly as OutputOperands for inline asm (PR #67109)

via cfe-commits cfe-commits at lists.llvm.org
Fri Sep 22 02:45:42 PDT 2023


llvmbot wrote:



@llvm/pr-subscribers-clang


This PR is based on #67018. It fixes a compilation issue for RVV tuple types as OutputOperands for inline asm.

---

Currently, for an RVV tuple type used as an inline asm output, the compiler generates https://godbolt.org/z/djebPfqxf, which cannot be code generated successfully (https://godbolt.org/z/na7T19Krc). This PR changes Clang to generate https://godbolt.org/z/MsovoxbY9 instead, which the back-end can handle.
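
As a minimal reproducer (mirroring the `foo` test added below; the function name is illustrative, and a riscv64 target with `zve32x` is assumed):

```cpp
#include <riscv_vector.h>

void repro() {
  // An RVV tuple type used as an inline asm output operand. Before this
  // patch, Clang made the asm call return the tuple as an aggregate
  // ({ <vscale x 2 x i32>, <vscale x 2 x i32> }), which the RISC-V back-end
  // cannot lower. With this patch the asm call returns a single
  // <vscale x 4 x i32>, which is then split back into the tuple fields.
  vint32m1x2_t v0;
  asm("#NOP" : "=vr"(v0));
}
```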

A follow-up PR will correctly handle the interaction between RVV tuple type InputOperands and OutputOperands.
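
For example, a tied (read-write) tuple operand exercises that interaction and would currently land on the `llvm_unreachable` FIXMEs added in this patch (hypothetical snippet, same target assumptions as above):

```cpp
#include <riscv_vector.h>

void inout_case() {
  vint32m1x2_t v;
  // "+vr" ties an InputOperand to an OutputOperand for the same tuple;
  // handling this combination is deferred to the follow-up PR.
  asm("#NOP" : "+vr"(v));
}
```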

---
Full diff: https://github.com/llvm/llvm-project/pull/67109.diff


2 Files Affected:

- (modified) clang/lib/CodeGen/CGStmt.cpp (+101-6) 
- (added) clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c (+54) 


``````````diff
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 6674aa2409a5947..4a2bdde56c5704e 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/PrettyStackTrace.h"
@@ -29,10 +30,13 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include <optional>
 
@@ -2392,6 +2396,26 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
         Tmp = Builder.CreateZExtOrTrunc(Tmp, TruncTy);
       } else if (TruncTy->isVectorTy()) {
         Tmp = Builder.CreateBitCast(Tmp, TruncTy);
+      } else if (TruncTy->isStructTy() && ResultRegQualTys[i]->isRVVType()) {
+        auto *STy = cast<llvm::StructType>(TruncTy);
+        auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               "Must be dealing with RVV tuple type");
+
+        unsigned MinElts = VTy->getElementCount().getKnownMinValue();
+        llvm::Value *StructValue = llvm::PoisonValue::get(STy);
+
+        for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+             Idx != TupleSize; ++Idx) {
+          llvm::Value *IdxValue =
+              llvm::ConstantInt::get(CGM.Int64Ty, Idx * MinElts);
+          llvm::Value *SubVec = Builder.CreateExtractVector(VTy, Tmp, IdxValue);
+
+          StructValue = Builder.CreateInsertValue(StructValue, SubVec, Idx);
+        }
+
+        Tmp = StructValue;
       }
     }
 
@@ -2399,7 +2423,13 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
     // ResultTypeRequiresCast elements correspond to the first
     // ResultTypeRequiresCast.size() elements of RegResults.
     if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) {
-      unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
+      unsigned Size;
+      if (ResultRegQualTys[i]->isRVVType() && TruncTy->isStructTy()) {
+        Size = cast<llvm::ScalableVectorType>(
+                   cast<llvm::StructType>(TruncTy)->getElementType(0))
+                   ->getScalarSizeInBits();
+      } else
+        Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Dest.getAddress(CGF).withElementType(ResultRegTypes[i]);
       if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
         Builder.CreateStore(Tmp, A);
@@ -2524,11 +2554,32 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       ResultRegIsFlagReg.push_back(IsFlagReg);
 
       llvm::Type *Ty = ConvertTypeForMem(QTy);
+      ResultTruncRegTypes.push_back(Ty);
+
+      // Expressing the type as a structure in inline asm calls would
+      // complicate the current code path, so instead the return type is set
+      // to a single scalable vector, which is later reconstructed with
+      // `vector.extract` and `insertvalue`. The flattened type is derived
+      // here; the reconstruction is done in EmitAsmStores.
+      if (QTy->isRVVType() && isa<llvm::StructType>(Ty)) {
+        // Flatten the structure into a single ScalableVectorType
+        auto *STy = cast<llvm::StructType>(Ty);
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+               "Dealing with RVV tuple (aggregate with homogeneous scalable "
+               "vectors");
+
+        auto *VecTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        Ty = llvm::ScalableVectorType::get(VecTy->getScalarType(),
+                                           STy->getNumElements() *
+                                               VecTy->getMinNumElements());
+      }
+
       const bool RequiresCast = Info.allowsRegister() &&
           (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
            Ty->isAggregateType());
 
-      ResultTruncRegTypes.push_back(Ty);
       ResultTypeRequiresCast.push_back(RequiresCast);
 
       if (RequiresCast) {
@@ -2551,6 +2602,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
         QualType InputTy = S.getInputExpr(InputNo)->getType();
         QualType OutputType = OutExpr->getType();
 
+        if ((InputTy->isRVVType() &&
+             isa<llvm::StructType>(ConvertType(InputTy))) ||
+            (OutputType->isRVVType() &&
+             isa<llvm::StructType>(ConvertType(OutputType)))) {
+          llvm_unreachable("FIXME: Deal with RVV type matching.");
+        }
+
         uint64_t InputSize = getContext().getTypeSize(InputTy);
         if (getContext().getTypeSize(OutputType) < InputSize) {
           // Form the asm to return the value as a larger integer or fp type.
@@ -2671,6 +2729,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       QualType OutputType = S.getOutputExpr(Output)->getType();
       QualType InputTy = InputExpr->getType();
 
+      if ((InputTy->isRVVType() &&
+           isa<llvm::StructType>(ConvertType(InputTy))) ||
+          (OutputType->isRVVType() &&
+           isa<llvm::StructType>(ConvertType(OutputType)))) {
+        llvm_unreachable("FIXME: Deal with RVV type matching.");
+      }
+
       if (getContext().getTypeSize(OutputType) >
           getContext().getTypeSize(InputTy)) {
         // Use ptrtoint as appropriate so that we can do our extension.
@@ -2701,10 +2766,40 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
           std::max((uint64_t)LargestVectorWidth,
                    VT->getPrimitiveSizeInBits().getKnownMinValue());
 
-    ArgTypes.push_back(Arg->getType());
-    ArgElemTypes.push_back(ArgElemType);
-    Args.push_back(Arg);
-    Constraints += InputConstraint;
+    // Expand RVV tuple type input operands.
+    if (InputExpr->getType()->isRVVType() && Arg->getType()->isStructTy()) {
+      std::string ExpandedInputConstraint;
+
+      auto *STy = cast<llvm::StructType>(Arg->getType());
+
+      assert(STy->containsHomogeneousScalableVectorTypes() &&
+             isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+             "Only aggregate type of homogeneous scalable vectors is handled "
+             "here");
+
+      auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+      for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+           Idx != TupleSize; ++Idx) {
+        if (!ExpandedInputConstraint.empty())
+          ExpandedInputConstraint += ",";
+
+        ExpandedInputConstraint += InputConstraint;
+        ArgTypes.push_back(VTy);
+        ArgElemTypes.push_back(ArgElemType);
+
+        llvm::Value *SubVec = Builder.CreateExtractValue(Arg, {Idx});
+
+        Args.push_back(SubVec);
+      }
+
+      Constraints += ExpandedInputConstraint;
+    } else {
+      ArgTypes.push_back(Arg->getType());
+      ArgElemTypes.push_back(ArgElemType);
+      Args.push_back(Arg);
+      Constraints += InputConstraint;
+    }
   }
 
   // Append the "input" part of inout constraints.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
new file mode 100644
index 000000000000000..24f403c6625d0aa
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -0,0 +1,54 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+#include <riscv_vector.h>
+
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @foo(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> asm "#NOP", "=^vr"() #[[ATTR2:[0-9]+]], !srcloc !4
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    ret void
+//
+void foo() {
+  vint32m1x2_t v0;
+  asm ("#NOP" : "=vr" (v0));
+}
+
+// CHECK-LABEL: define dso_local void @bar(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } asm "#NOP", "=^vr,=^vr"() #[[ATTR2]], !srcloc !5
+// CHECK-NEXT:    [[ASMRESULT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[ASMRESULT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP5]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP6]], <vscale x 2 x i32> [[TMP7]], 1
+// CHECK-NEXT:    ret void
+//
+void bar() {
+  vint32m1x2_t v0, v2;
+  asm ("#NOP" : "=vr" (v0), "=vr" (v2));
+}
+
+// CHECK-LABEL: define dso_local void @baz(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 0
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 1
+// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr,^vr"(<vscale x 2 x i32> [[TMP0]], <vscale x 2 x i32> [[TMP1]]) #[[ATTR3:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    ret void
+//
+void baz() {
+  vint32m1x2_t v2;
+  asm ("#NOP" :: "vr" (v2));
+}

``````````
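
For readers less familiar with the IRBuilder calls involved, the following standalone sketch shows the flatten/reconstruct scheme the patch applies to output operands (`flattenTupleType` and `rebuildTuple` are illustrative helpers, not part of the patch):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Flatten an RVV tuple type such as { <vscale x 2 x i32>, <vscale x 2 x i32> }
// into the single scalable vector (<vscale x 4 x i32>) used as the asm result
// type.
static ScalableVectorType *flattenTupleType(StructType *STy) {
  auto *VTy = cast<ScalableVectorType>(STy->getElementType(0));
  return ScalableVectorType::get(VTy->getScalarType(),
                                 STy->getNumElements() *
                                     VTy->getMinNumElements());
}

// Rebuild the tuple value from the flattened asm result, one field at a time,
// using llvm.vector.extract at offsets 0, MinElts, 2 * MinElts, ...
static Value *rebuildTuple(IRBuilder<> &Builder, StructType *STy,
                           Value *Flat) {
  auto *VTy = cast<ScalableVectorType>(STy->getElementType(0));
  unsigned MinElts = VTy->getElementCount().getKnownMinValue();
  Value *Tuple = PoisonValue::get(STy);
  for (unsigned Idx = 0, N = STy->getNumElements(); Idx != N; ++Idx) {
    Value *Sub = Builder.CreateExtractVector(
        VTy, Flat, Builder.getInt64(Idx * MinElts));
    Tuple = Builder.CreateInsertValue(Tuple, Sub, Idx);
  }
  return Tuple;
}
```

The input side (from #67018, visible in the `baz` test above) is the mirror image: each tuple field is extracted with `extractvalue` and passed as its own `vr` operand, with the constraint string duplicated once per field.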



https://github.com/llvm/llvm-project/pull/67109

