[clang] [Clang][RISCV] Handle RVV tuple types correctly as OutputOperands for inline asm (PR #67109)

Yueh-Ting Chen via cfe-commits cfe-commits at lists.llvm.org
Fri Sep 22 02:44:39 PDT 2023


https://github.com/eopXD created https://github.com/llvm/llvm-project/pull/67109

This PR is based on #67018. This PR fixes compilation issue for RVV tuple types as InputOperands for inline asm.

---

Currently the compiler generates https://godbolt.org/z/djebPfqxf for tuple type as inline asm inputs and cannot be code generated successfully https://godbolt.org/z/na7T19Krc. This PR fixes Clang by generating https://godbolt.org/z/MsovoxbY9 instead, which can be successfully handled by the back-end.

A follow-up PR will handle interactions of RVV tuple type InputOperands and OutputOperands correctly.

>From a26eb9fe0c09fc0fd504d261874630d8b5edae26 Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Thu, 21 Sep 2023 06:34:57 -0700
Subject: [PATCH 1/3] [Clang][RISCV] Handle RVV tuple types correctly as
 OutputOperand for inline asm

The RVV tuple type maps to an aggregate type with homogeneous scalable
vectors. EmitAsmStmt does not handle this correctly and this commit
attempts to fix it.

Expressing the type as a structure in inline asm calls will complicate
the current code base, so instead, the return type is set to be
a single scalable vector, then reconstructed with `vector.extract` and
`insertvalue`.

A follow-up commit will deal with details when associated with
InputOperands.
---
 clang/lib/CodeGen/CGStmt.cpp                  | 69 ++++++++++++++++++-
 .../rvv-inline-asm.c                          | 41 +++++++++++
 2 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 6674aa2409a5947..948539ea546084e 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/PrettyStackTrace.h"
@@ -29,10 +30,13 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include <optional>
 
@@ -2392,6 +2396,26 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
         Tmp = Builder.CreateZExtOrTrunc(Tmp, TruncTy);
       } else if (TruncTy->isVectorTy()) {
         Tmp = Builder.CreateBitCast(Tmp, TruncTy);
+      } else if (TruncTy->isStructTy() && ResultRegQualTys[i]->isRVVType()) {
+        auto *STy = cast<llvm::StructType>(TruncTy);
+        auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               "Must be dealing with RVV tuple type");
+
+        unsigned MinElts = VTy->getElementCount().getKnownMinValue();
+        llvm::Value *StructValue = llvm::PoisonValue::get(STy);
+
+        for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+             Idx != TupleSize; ++Idx) {
+          llvm::Value *IdxValue =
+              llvm::ConstantInt::get(CGM.Int64Ty, Idx * MinElts);
+          llvm::Value *SubVec = Builder.CreateExtractVector(VTy, Tmp, IdxValue);
+
+          StructValue = Builder.CreateInsertValue(StructValue, SubVec, Idx);
+        }
+
+        Tmp = StructValue;
       }
     }
 
@@ -2399,7 +2423,13 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
     // ResultTypeRequiresCast elements correspond to the first
     // ResultTypeRequiresCast.size() elements of RegResults.
     if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) {
-      unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
+      unsigned Size;
+      if (ResultRegQualTys[i]->isRVVType() && TruncTy->isStructTy()) {
+        Size = cast<llvm::ScalableVectorType>(
+                   cast<llvm::StructType>(TruncTy)->getElementType(0))
+                   ->getScalarSizeInBits();
+      } else
+        Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Dest.getAddress(CGF).withElementType(ResultRegTypes[i]);
       if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
         Builder.CreateStore(Tmp, A);
@@ -2524,11 +2554,32 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       ResultRegIsFlagReg.push_back(IsFlagReg);
 
       llvm::Type *Ty = ConvertTypeForMem(QTy);
+      ResultTruncRegTypes.push_back(Ty);
+
+      // Expressing the type as a structure in inline asm calls will complicate
+      // the current code case, so instead, the return type is set to be a
+      // single scalable vector, then reconstructed with `vector.extract` and
+      // `insertvalue`. The type is derived here, and the reconstruction is done
+      // under EmitAsmStores.
+      if (QTy->isRVVType() && isa<llvm::StructType>(Ty)) {
+        // Flatten the structure into a single ScalableVectorType
+        auto *STy = cast<llvm::StructType>(Ty);
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+               "Dealing with RVV tuple (aggregate with homogeneous scalable "
+               "vectors");
+
+        auto *VecTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        Ty = llvm::ScalableVectorType::get(VecTy->getScalarType(),
+                                           STy->getNumElements() *
+                                               VecTy->getMinNumElements());
+      }
+
       const bool RequiresCast = Info.allowsRegister() &&
           (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
            Ty->isAggregateType());
 
-      ResultTruncRegTypes.push_back(Ty);
       ResultTypeRequiresCast.push_back(RequiresCast);
 
       if (RequiresCast) {
@@ -2551,6 +2602,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
         QualType InputTy = S.getInputExpr(InputNo)->getType();
         QualType OutputType = OutExpr->getType();
 
+        if ((InputTy->isRVVType() &&
+             isa<llvm::StructType>(ConvertType(InputTy))) ||
+            (OutputType->isRVVType() &&
+             isa<llvm::StructType>(ConvertType(OutputType)))) {
+          llvm_unreachable("FIXME: Deal with RVV type matching.");
+        }
+
         uint64_t InputSize = getContext().getTypeSize(InputTy);
         if (getContext().getTypeSize(OutputType) < InputSize) {
           // Form the asm to return the value as a larger integer or fp type.
@@ -2671,6 +2729,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       QualType OutputType = S.getOutputExpr(Output)->getType();
       QualType InputTy = InputExpr->getType();
 
+      if ((InputTy->isRVVType() &&
+           isa<llvm::StructType>(ConvertType(InputTy))) ||
+          (OutputType->isRVVType() &&
+           isa<llvm::StructType>(ConvertType(OutputType)))) {
+        llvm_unreachable("FIXME: Deal with RVV type matching.");
+      }
+
       if (getContext().getTypeSize(OutputType) >
           getContext().getTypeSize(InputTy)) {
         // Use ptrtoint as appropriate so that we can do our extension.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
new file mode 100644
index 000000000000000..cad4f8ed5dcbd48
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -0,0 +1,41 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+#include <riscv_vector.h>
+
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @foo(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> asm "#NOP", "=^vr"() #[[ATTR2:[0-9]+]], !srcloc !4
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    ret void
+//
+void foo() {
+  vint32m1x2_t v0;
+  asm ("#NOP" : "=vr" (v0));
+}
+
+// CHECK-LABEL: define dso_local void @bar(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } asm "#NOP", "=^vr,=^vr"() #[[ATTR2]], !srcloc !5
+// CHECK-NEXT:    [[ASMRESULT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[ASMRESULT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP5]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP6]], <vscale x 2 x i32> [[TMP7]], 1
+// CHECK-NEXT:    ret void
+//
+void bar() {
+  vint32m1x2_t v0, v2;
+  asm ("#NOP" : "=vr" (v0), "=vr" (v2));
+}

>From c30f2ac73ab358723cbff0c0552ad5afafadeb8a Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Fri, 22 Sep 2023 02:30:58 -0700
Subject: [PATCH 2/3] [Clang][RISCV] Precommit to show current codegen for
 inline asm of RVV tuple type

The generated LLVM cannot be successfully handled because function
argument of tuple type is an aggregate of scalable vectors. It needs to
be flattened into separate arguments.
---
 .../RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
index cad4f8ed5dcbd48..8447aee1b715403 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -39,3 +39,14 @@ void bar() {
   vint32m1x2_t v0, v2;
   asm ("#NOP" : "=vr" (v0), "=vr" (v2));
 }
+
+// CHECK-LABEL: define dso_local void @baz(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr"({ <vscale x 2 x i32>, <vscale x 2 x i32> } undef) #[[ATTR3:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    ret void
+//
+void baz() {
+  vint32m1x2_t v2;
+  asm ("#NOP" :: "vr" (v2));
+}

>From 38de145299f18b26e7195bf620e9d50dbacb941a Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Fri, 22 Sep 2023 02:32:38 -0700
Subject: [PATCH 3/3] [Clang][RISCV] Handle RVV tuple types correctly as
 InputOperand for inline asm

This commit flatten the input operand from an aggregate structure into
separate arguments for RVV tuple types.
---
 clang/lib/CodeGen/CGStmt.cpp                  | 38 +++++++++++++++++--
 .../rvv-inline-asm.c                          |  4 +-
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 948539ea546084e..4a2bdde56c5704e 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2766,10 +2766,40 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
           std::max((uint64_t)LargestVectorWidth,
                    VT->getPrimitiveSizeInBits().getKnownMinValue());
 
-    ArgTypes.push_back(Arg->getType());
-    ArgElemTypes.push_back(ArgElemType);
-    Args.push_back(Arg);
-    Constraints += InputConstraint;
+    // Expand RVV tuple type input operands.
+    if (InputExpr->getType()->isRVVType() && Arg->getType()->isStructTy()) {
+      std::string ExpandedInputContraint;
+
+      auto *STy = cast<llvm::StructType>(Arg->getType());
+
+      assert(STy->containsHomogeneousScalableVectorTypes() &&
+             isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+             "Only aggregate type of homogeneous scalable vectors is handled "
+             "here");
+
+      auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+      for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+           Idx != TupleSize; ++Idx) {
+        if (ExpandedInputContraint.size())
+          ExpandedInputContraint += ",";
+
+        ExpandedInputContraint += InputConstraint;
+        ArgTypes.push_back(VTy);
+        ArgElemTypes.push_back(ArgElemType);
+
+        llvm::Value *SubVec = Builder.CreateExtractValue(Arg, {Idx});
+
+        Args.push_back(SubVec);
+      }
+
+      Constraints += ExpandedInputContraint;
+    } else {
+      ArgTypes.push_back(Arg->getType());
+      ArgElemTypes.push_back(ArgElemType);
+      Args.push_back(Arg);
+      Constraints += InputConstraint;
+    }
   }
 
   // Append the "input" part of inout constraints.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
index 8447aee1b715403..24f403c6625d0aa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -43,7 +43,9 @@ void bar() {
 // CHECK-LABEL: define dso_local void @baz(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr"({ <vscale x 2 x i32>, <vscale x 2 x i32> } undef) #[[ATTR3:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 0
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 1
+// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr,^vr"(<vscale x 2 x i32> [[TMP0]], <vscale x 2 x i32> [[TMP1]]) #[[ATTR3:[0-9]+]], !srcloc !6
 // CHECK-NEXT:    ret void
 //
 void baz() {



More information about the cfe-commits mailing list