[llvm-dev] Fwd: KNL Assembly Code for Matrix Multiplication

Fri Jun 30 15:47:15 PDT 2017

---------- Forwarded message ----------
From: hameeza ahmed <hahmed2305 at gmail.com>
Date: Sat, Jul 1, 2017 at 3:46 AM
Subject: Re: [llvm-dev] KNL Assembly Code for Matrix Multiplication
To: Craig Topper <craig.topper at gmail.com>

Thank You.

in this step;
vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 =
[8,9,10,11,12,13,14,15]
the indexes are 64 bit but the element stored at these position is 32 bit
since we are dealing with integers and ir also shows this.
here we are loading 32 bit value from those 64 bit indexes which means
zmm22 will hold values 32 bit from these 64 bit position so there is
capacity of 16 32 bit elements then why all this??

this is mentioned in IR as

  %5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64
%indvars.iv34, i64 %4
  %6 = bitcast i32* %5 to <16 x i32>*
  %wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1

here indvars are 64 bit values but the values loaded from these indexes
(step 3) is 32 bit???

Please correct me.

On Fri, Jun 30, 2017 at 8:59 PM, Craig Topper <craig.topper at gmail.com>
wrote:

> Some comments inline, I'll need to look more later.
>
> ~Craig
>
> On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Hello, I want some help in understanding knl intel assembly of matrix
>> multiplication code. some of the things are not clear;
>>
>> here .c file:
>>
>> #include <stdio.h>
>> #define N 1000
>>
>> // This function multiplies A[][] and B[][], and stores
>> // the result in C[][]
>> void multiply(int A[][N], int B[][N], int C[][N])
>> {
>>     int i, j, k, r;
>>     for (i = 0; i < N; i++)
>>     {
>>         for (j = 0; j < N; j++)
>>         {
>>             r = 0;
>>             for (k = 0; k < N; k++) {
>>                 r += A[i][k]*B[k][j];}
>>                 C[i][j] = r;
>>
>>         }
>>
>>     }
>> }
>>
>> here .s file: * the code that i want to ask is in red color.*
>>
>> .text
>> .intel_syntax noprefix
>> .file "matn_o3.ll"
>> .section .rodata,"a", at progbits
>> .p2align 6
>> .LCPI0_0:
>> .quad 8                       # 0x8
>> .quad 9                       # 0x9
>> .quad 10                      # 0xa
>> .quad 11                      # 0xb
>> .quad 12                      # 0xc
>> .quad 13                      # 0xd
>> .quad 14                      # 0xe
>> .quad 15                      # 0xf
>> .LCPI0_1:
>> .quad 0                       # 0x0
>> .quad 1                       # 0x1
>> .quad 2                       # 0x2
>> .quad 3                       # 0x3
>> .quad 4                       # 0x4
>> .quad 5                       # 0x5
>> .quad 6                       # 0x6
>> .quad 7                       # 0x7
>> .section .rodata.cst8,"aM", at progbits,8
>> .p2align 3
>> .LCPI0_2:
>> .quad 4000                    # 0xfa0
>> .LCPI0_3:
>> .quad 64000                   # 0xfa00
>> .LCPI0_4:
>> .quad 128000                  # 0x1f400
>> .LCPI0_5:
>> .quad 192000                  # 0x2ee00
>> .LCPI0_6:
>> .quad 64                      # 0x40
>> .text
>> .globl multiply
>> .p2align 4, 0x90
>> .type multiply, at function
>> multiply:                               # @multiply
>> .cfi_startproc
>> # BB#0:
>> push rbp
>> .Lcfi0:
>> .cfi_def_cfa_offset 16
>> push r15
>> .Lcfi1:
>> .cfi_def_cfa_offset 24
>> push r14
>> .Lcfi2:
>> .cfi_def_cfa_offset 32
>> push r12
>> .Lcfi3:
>> .cfi_def_cfa_offset 40
>> push rbx
>> .Lcfi4:
>> .cfi_def_cfa_offset 48
>> .Lcfi5:
>> .cfi_offset rbx, -48
>> .Lcfi6:
>> .cfi_offset r12, -40
>> .Lcfi7:
>> .cfi_offset r14, -32
>> .Lcfi8:
>> .cfi_offset r15, -24
>> .Lcfi9:
>> .cfi_offset rbp, -16
>> lea r8, [rdi + 3856]
>> xor r9d, r9d
>> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 =
>> [8,9,10,11,12,13,14,15]
>> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
>> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
>> vpbroadcastq zmm3, rsi
>> add rsi, 3856000
>> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
>> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
>> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
>> kxnorw k1, k0, k0
>> kshiftrw k1, k1, 8
>> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
>> .p2align 4, 0x90
>> .LBB0_1:                                # %.preheader26
>>                                         # =>This Loop Header: Depth=1
>>                                         #     Child Loop BB0_2 Depth 2
>>                                         #       Child Loop BB0_3 Depth 3
>>                                         #       Child Loop BB0_5 Depth 3
>> xor r11d, r11d
>> .p2align 4, 0x90
>> .LBB0_2:                                # %.preheader
>>                                         #   Parent Loop BB0_1 Depth=1
>>                                         # =>  This Loop Header: Depth=2
>>                                         #       Child Loop BB0_3 Depth 3
>>                                         #       Child Loop BB0_5 Depth 3
>> vpxord zmm8, zmm8, zmm8
>> mov ecx, 960
>> vmovdqa64 zmm9, zmm23
>> vmovdqa64 zmm10, zmm22
>> vpxord zmm11, zmm11, zmm11
>> vpxord zmm12, zmm12, zmm12
>> vpxord zmm13, zmm13, zmm13
>> .p2align 4, 0x90
>> .LBB0_3:                                # %vector.body
>>                                         #   Parent Loop BB0_1 Depth=1
>>                                         #     Parent Loop BB0_2 Depth=2
>>                                         # =>    This Inner Loop Header:
>> Depth=3
>>                                         # this bb will run 15 times
>> vmovq rax, xmm9
>> imul r10, r9, 4000
>> lea rbx, [rdi + r10]
>> *vpmuludq zmm14, zmm10, zmm2       ; this is BB for vector here we have
>> to do gather for B due to arbitrary addresses so here
>> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in
>> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B
>> we need to multiple indexes with stride=4000. i know here these indexes are
>> 64 bit but the values stored in these locations are 32 bits then  the load
>> using zmm10 index will give 8 elements of 32 bits present in these
>> locations, so do the registers contain 8 elements of 32 bits present at
>> specified indexes?? so after multiplication we get indexes for higher 8
>> elements of B i.e [3200,3600,40000,.......54000].*
>>
>> * vpsrlq zmm15, zmm10, 32              ; i dont understand the need for
>> this step, please explain the purpose of all these steps. here vpsrlq will
>> shift right zmm10 values by 256 bits (32*8)....zmmm10 initially=**[8,9,10,11,12,13,14,15].
>> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me
>> the purpose of this step.*
>> * vpmuludq zmm15, zmm15, zmm2  ;    similarly **dont understand the need
>> for this step.*
>> * vpsllq zmm15, zmm15, 32    ; **dont understand the need for this step*
>> * vpaddq zmm14, zmm14, zmm3  ; *
>> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this step*
>>
>
> vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32
> bits. I believe this effectively taking every odd numbered 32-bit element
> and moving them to the next lowest even numbered 32-bit element.
>
> vmuludq multiplies all even numbered 32-bit elements and creates 64-bit
> results.
>
> The combination of the shifts, vpmuludq, and vpaddq is to multiply 64-bit
> elements and create a 64-bit elements result. We don't have an instruction
> for this so we have to multiply the low 32-bits of each element and the
> high 32-bits of each element separately and add the results together. Looks
> like we determined that the high 32-bits of one of the inputs is all zeros
> so we skipped 1 of the multiplies and adds that would normally be required
> for this operation.
>
>
>
>> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes
>> whats the need of this step?*
>> * vpsllq zmm15, zmm15, 2   ; **dont understand the need for this step*
>> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this step*
>> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8
>> elements of B indexes are computed as Zmm16=[0,4000,8000,.......28000]*
>> * vpsrlq zmm17, zmm9, 32   **; **dont understand the need for this step*
>> * vpmuludq zmm17, zmm17, zmm2  **; **dont understand the need for this
>> step*
>> * vpsllq zmm17, zmm17, 32  **; **dont understand the need for this step*
>> * vpaddq zmm16, zmm16, zmm3  *
>> * vpaddq zmm16, zmm17, zmm16  **; **dont understand the need for this
>> step*
>> * vpaddq zmm15, zmm16, zmm15  **; **dont understand the need for this
>> step*
>> * vpaddq zmm16, zmm15, zmm4*
>> * vpaddq zmm17, zmm14, zmm4*
>> * vpaddq zmm18, zmm15, zmm5*
>> * vpaddq zmm19, zmm14, zmm5*
>> * vpaddq zmm20, zmm15, zmm6*
>> * vpaddq zmm21, zmm14, zmm6*
>> * kmovw k2, k1  **; **dont understand the need for this step*
>>
>
> The gather instruction requires a mask of which elements to read. When the
> gather completes, if there are no faults it will have written the mask
> register to 0. So it needs to reloaded for each gather.
>
>
>> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8
>> indexes ( or values at these 8 indexes???) so it will load 8 elements not
>> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by the
>> above computation these indexes are changes??*
>> * kxnorw k2, k0, k0  **; **dont understand the need for this step*
>>
> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm15]   **; **here again issues
>> with index zmm15. it should be **[0,4000,8000,.......28000] but its
>> different due to above computation.*
>> * vinserti64x4 zmm0, zmm14, ymm0, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]*
>> * vinserti64x4 zmm14, zmm15, ymm14, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]*
>> * vinserti64x4 zmm15, zmm16, ymm15, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]*
>> * vinserti64x4 zmm1, zmm16, ymm1, 1*
>> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]*
>> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]
>> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]
>> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
>> vpaddd zmm8, zmm0, zmm8
>> vpaddd zmm11, zmm14, zmm11
>> vpaddd zmm12, zmm15, zmm12
>> vpaddd zmm13, zmm1, zmm13
>> vpaddq zmm9, zmm9, zmm7        #zmm7=64
>> vpaddq zmm10, zmm10, zmm7
>> add rcx, -64     #decrement counter by 64
>> jne .LBB0_3       # if rcx not equal to zero goto .lbbo_3
>> # BB#4:                                 # %middle.block
>>                                         #   in Loop: Header=BB0_2 Depth=2
>> vpaddd zmm0, zmm11, zmm8
>> vpaddd zmm0, zmm12, zmm0
>> vpaddd zmm0, zmm13, zmm0
>> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   ;
>> please explain how shuffle instructions work here. i know of llvm ir
>> shuffle, but these assembly ones are difficult for me to understand*
>>
>
> You have to look at the size of the register being mentioned and the
> number of elements in brackets. In this case the regsiter is 512-bits and
> the number of elements is 8. 512/8 is 64. So its a shuffle of a v8i64
> vector. Then we read the element numbers from left to write just like the
> shuffle IR instruction.
>
> So element 0 of zmm1 gets the value of element 4 of zmm0. Element 1 of
> zmm1 gets the value of element 5 of zmm5, etc.
>
>
>> * vpaddd zmm0, zmm0, zmm1*
>> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]*
>> * vpaddd zmm0, zmm0, zmm1*
>> * vpshufd zmm1, zmm0, 238         # zmm1 =
>> zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]*
>> * vpaddd zmm0, zmm0, zmm1*
>> * vpshufd zmm1, zmm0, 229         # zmm1 =
>> zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]*
>> vpaddd zmm0, zmm0, zmm1
>> vmovd ebx, xmm0
>> mov rax, r8
>> xor r14d, r14d
>> .p2align 4, 0x90
>> .LBB0_5:                                #   Parent Loop BB0_1 Depth=1
>>                                         #     Parent Loop BB0_2 Depth=2
>>                                         # =>    This Inner Loop Header:
>> Depth=3
>> lea r15, [rsi + r14]
>> mov r12d, dword ptr [r15 + 4*r11 - 16000]
>> imul r12d, dword ptr [rax - 16]
>> mov ecx, dword ptr [r15 + 4*r11 - 12000]
>> imul ecx, dword ptr [rax - 12]
>> mov ebp, dword ptr [r15 + 4*r11 - 8000]
>> imul ebp, dword ptr [rax - 8]
>> add r12d, ebx
>> add ecx, r12d
>> add ebp, ecx
>> mov ecx, dword ptr [r15 + 4*r11 - 4000]
>> imul ecx, dword ptr [rax - 4]
>> add ecx, ebp
>> mov ebx, dword ptr [r15 + 4*r11]
>> imul ebx, dword ptr [rax]
>> add ebx, ecx
>> add r14, 20000
>> add rax, 20
>> cmp r14, 160000
>> jne .LBB0_5
>> # BB#6:                                 # %.loopexit
>>                                         #   in Loop: Header=BB0_2 Depth=2
>> add r10, rdx                #rdx is c[][]
>> mov dword ptr [r10 + 4*r11], ebx
>> inc r11
>> cmp r11, 1000
>> jne .LBB0_2
>> # BB#7:                                 #   in Loop: Header=BB0_1 Depth=1
>> inc r9
>> add r8, 4000
>> cmp r9, 1000
>> jne .LBB0_1
>> # BB#8:
>> pop rbx
>> pop r12
>> pop r14
>> pop r15
>> pop rbp
>> ret
>>
>>
>> Looking forward to your reply
>>
>> Thank You
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170701/954b2788/attachment.html>