mirror of
https://code.it4i.cz/sccs/easyconfigs-it4i.git
synced 2025-04-08 07:52:11 +01:00

new file: b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch new file: b/BLIS/BLIS-2.2-GCCcore-10.2.0.eb new file: b/BLIS/BLIS-2.2-amd_fix-undefined-reference-blist-abort.patch new file: b/BLIS/BLIS-3.0.1-GCCcore-10.2.0.eb new file: b/Biopython/Biopython-1.72-foss-2020b-Python-2.7.18.eb new file: c/Cordax/Cordax-1.0-Python-2.7.18.eb new file: d/DFTB+/DFTB+-21.2-intel-2020b-Python-3.8.6.eb new file: d/DFTB+/DFTB+-21.2-intel-2020b-TB.eb new file: d/Dakota/Dakota-6.15.0-fix_lapack_detection.patch new file: d/Dakota/Dakota-6.15.0-foss-2021b.eb new file: d/Dakota/Dakota-6.15.0-intel-2021b.eb new file: e/ELPA/ELPA-2020.11.001-fosscuda-2020b.eb new file: f/FoldX/FoldX-5.0.eb new file: f/Forge/Forge-21.1.3.eb new file: g/GROMACS/GROMACS-2018.8-fosscuda-2020b-PLUMED-2.5.6-switch.eb new file: g/GROMACS/GROMACS-2021.4-foss-2020b-PLUMED-2.7.3.eb new file: h/HDF5/HDF5-1.12.1-NVHPC-21.11.eb new file: h/HDF5/HDF5-1.12.1-foss-2021b-parallel.eb new file: h/HDF5/HDF5-1.12.1-iimpi-2021b.eb new file: h/HDF5/HDF5-1.12.1-intel-2021b-parallel.eb new file: h/HyperQueue/HyperQueue-0.8.0.eb new file: h/Hypre/Hypre-2.23.0-intel-2020b.eb new file: h/h5py/h5py-3.6.0-intel-2021b.eb new file: l/LAPACK/LAPACK-3.10.0-GCC-11.2.0.eb new file: l/libFLAME/libFLAME-5.2.0-GCCcore-10.2.0.eb new file: m/MaSuRCA/MaSuRCA-4.0.7-foss-2020a-Perl-5.30.2.eb new file: m/Molpro/Molpro-mpp-2022.1.1.linux_x86_64_mpipr.eb new file: m/Molpro/Molpro-mpp-2022.1.1.linux_x86_64_sockets.eb new file: n/nompi/nompi-2022a.eb new file: o/ORCA/ORCA-5.0.3-OpenMPI-4.1.1.eb modified: o/Octopus/Octopus-11.3-intel-2020b-mpi.eb new file: o/OpenCV/OpenCV-4.5.3-foss-2021a-CUDA-11.3.1-contrib.eb new file: o/OpenCV/OpenCV-4.5.5-foss-2021a-CUDA-11.4.1-contrib.eb new file: o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb new file: o/OpenMPI/OpenMPI-4.1.2-GCC-11.2.0-Java-1.8.0_221.eb new file: o/OpenMPI/OpenMPI-4.1.2-GCC-11.2.0.eb modified: p/PETSc/PETSc-3.14.4-intel-2020b.eb modified: p/PLUMED/PLUMED-2.5.6-fosscuda-2020b-patch.eb new file: 
p/PLUMED/PLUMED-2.5.6-fosscuda-2020b-switch.eb new file: p/PLUMED/PLUMED-2.7.3-foss-2020b.eb modified: p/phonopy/phonopy-2.12.0-conda.eb modified: q/QMCPACK/QMCPACK-3.11.0-intel-2020b-Python-3.8.6.eb new file: q/QMCPACK/QMCPACK-3.12.0-intel-2020b-Python-3.8.6.eb new file: q/QMCPACK/QMCPACK-3.12.0-intel-2021b-Python-3.9.6-lowopt.eb new file: q/QMCPACK/QMCPACK-3.13.0-intel-2020b-Python-3.8.6.eb new file: q/QuantumESPRESSO/QuantumESPRESSO-6.7-intel-2021a.eb new file: q/QuantumESPRESSO/QuantumESPRESSO-7.0-NVHPC-21.9.eb new file: r/rocm-cuda2hip/rocm-cuda2hip-4.3.1-gcccuda-2020b.eb new file: s/ScaLAPACK/ScaLAPACK-2.2-NVHPC-21.11.eb new file: t/Tango/Tango.eb new file: t/Tensorflow/TensorFlow-2.5.0-fosscuda-2020b.eb new file: v/VASP/VASP-5.4.1-24Jun15-intel-2020b.eb new file: w/Waltz/Waltz.eb new file: y/Yambo/Yambo-5.0.4-intel-2020a.eb
2220 lines
46 KiB
Diff
2220 lines
46 KiB
Diff
Taken from https://github.com/flame/blis/pull/544
|
|
Fixes a problem with DGEMM causing FPE signalling on Broadwell
|
|
See https://github.com/flame/blis/issues/486
|
|
|
|
Åke Sandgren, 20210916
|
|
|
|
commit 5191c43faccf45975f577c60b9089abee25722c9
|
|
Author: Devin Matthews <damatthews@smu.edu>
|
|
Date: Thu Sep 16 10:16:17 2021 -0500
|
|
|
|
Fix more copy-paste errors in the haswell gemmsup code.
|
|
|
|
Fixes #486.
|
|
|
|
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
index 4c6094b1..21dd3b89 100644
|
|
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
@@ -101,7 +101,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
begin_asm()
|
|
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
-
|
|
+
|
|
mov(var(a), r14) // load address of a.
|
|
mov(var(rs_a), r8) // load rs_a
|
|
//mov(var(cs_a), r9) // load cs_a
|
|
@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
|
|
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), r12) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -172,19 +172,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
|
|
#endif
|
|
lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a
|
|
-
|
|
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
vfmadd231pd(ymm2, ymm3, ymm15)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rax ), ymm0)
|
|
@@ -250,7 +250,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -312,27 +312,27 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
vfmadd231pd(ymm2, ymm3, ymm15)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -343,7 +343,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
vmovupd(mem(rax, r8, 1), ymm1)
|
|
vmovupd(mem(rax, r8, 2), ymm2)
|
|
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
|
|
-
|
|
+
|
|
vmovupd(mem(rbx ), ymm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
vfmadd231pd(ymm1, ymm3, ymm5)
|
|
@@ -365,21 +365,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
vfmadd231pd(ymm2, ymm3, ymm15)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -387,12 +387,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rax ), xmm0)
|
|
vmovsd(mem(rax, r8, 1), xmm1)
|
|
vmovsd(mem(rax, r8, 2), xmm2)
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
vfmadd231pd(ymm1, ymm3, ymm5)
|
|
@@ -414,12 +414,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
vfmadd231pd(ymm2, ymm3, ymm15)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -427,11 +427,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
label(.DPOSTACCUM)
|
|
|
|
|
|
-
|
|
- // ymm4 ymm7 ymm10 ymm13
|
|
+
|
|
+ // ymm4 ymm7 ymm10 ymm13
|
|
// ymm5 ymm8 ymm11 ymm14
|
|
// ymm6 ymm9 ymm12 ymm15
|
|
-
|
|
+
|
|
vhaddpd( ymm7, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm0 )
|
|
@@ -469,7 +469,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
// xmm6[0:3] = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
|
|
|
|
|
|
-
|
|
+
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
|
|
|
|
@@ -477,73 +477,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
|
|
vmulpd(ymm0, ymm5, ymm5)
|
|
vmulpd(ymm0, ymm6, ymm6)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm4)
|
|
vmovupd(ymm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm5)
|
|
vmovupd(ymm5, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm6)
|
|
vmovupd(ymm6, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovupd(ymm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovupd(ymm5, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovupd(ymm6, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
lea(mem(r12, rdi, 2), r12) //
|
|
@@ -560,7 +560,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
|
|
|
|
label(.DRETURN)
|
|
|
|
-
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
@@ -629,7 +629,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
|
|
mov(var(a), rax) // load address of a.
|
|
@@ -649,7 +649,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
|
|
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -682,7 +682,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
//lea(mem(r14), rax) // rax = a;
|
|
//lea(mem(rdx), rbx) // rbx = b;
|
|
|
|
-
|
|
+
|
|
#if 1
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
|
|
@@ -690,18 +690,18 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
|
|
#endif
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -730,7 +730,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rax ), ymm0)
|
|
@@ -756,7 +756,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -807,27 +807,27 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -836,7 +836,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
vmovupd(mem(rax ), ymm0)
|
|
vmovupd(mem(rax, r8, 1), ymm1)
|
|
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
|
|
-
|
|
+
|
|
vmovupd(mem(rbx ), ymm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
vfmadd231pd(ymm1, ymm3, ymm5)
|
|
@@ -854,21 +854,21 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -876,11 +876,11 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rax ), xmm0)
|
|
vmovsd(mem(rax, r8, 1), xmm1)
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
vfmadd231pd(ymm1, ymm3, ymm5)
|
|
@@ -898,12 +898,12 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
vfmadd231pd(ymm1, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -911,10 +911,10 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
label(.DPOSTACCUM)
|
|
|
|
|
|
-
|
|
- // ymm4 ymm7 ymm10 ymm13
|
|
+
|
|
+ // ymm4 ymm7 ymm10 ymm13
|
|
// ymm5 ymm8 ymm11 ymm14
|
|
-
|
|
+
|
|
vhaddpd( ymm7, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm0 )
|
|
@@ -943,75 +943,75 @@ void bli_dgemmsup_rd_haswell_asm_2x4
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
|
|
vmulpd(ymm0, ymm5, ymm5)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm4)
|
|
vmovupd(ymm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm5)
|
|
vmovupd(ymm5, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovupd(ymm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovupd(ymm5, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
@@ -1079,7 +1079,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
|
|
mov(var(a), rax) // load address of a.
|
|
@@ -1099,7 +1099,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
|
|
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -1128,26 +1128,26 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
//lea(mem(r14), rax) // rax = a;
|
|
//lea(mem(rdx), rbx) // rbx = b;
|
|
|
|
-
|
|
+
|
|
#if 1
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
|
|
prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
|
|
- prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
|
|
+ //prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
|
|
#endif
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -1170,7 +1170,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rax ), ymm0)
|
|
@@ -1191,7 +1191,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
#endif
|
|
@@ -1231,27 +1231,27 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -1259,7 +1259,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
|
|
vmovupd(mem(rax ), ymm0)
|
|
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
|
|
-
|
|
+
|
|
vmovupd(mem(rbx ), ymm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
@@ -1273,21 +1273,21 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -1295,10 +1295,10 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rax ), xmm0)
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm3)
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
@@ -1312,12 +1312,12 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm13)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -1325,9 +1325,9 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
label(.DPOSTACCUM)
|
|
|
|
|
|
-
|
|
- // ymm4 ymm7 ymm10 ymm13
|
|
-
|
|
+
|
|
+ // ymm4 ymm7 ymm10 ymm13
|
|
+
|
|
vhaddpd( ymm7, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm0 )
|
|
@@ -1339,15 +1339,15 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
|
|
|
|
|
|
- vhaddpd( ymm8, ymm5, ymm0 )
|
|
- vextractf128(imm(1), ymm0, xmm1 )
|
|
- vaddpd( xmm0, xmm1, xmm0 )
|
|
+ //vhaddpd( ymm8, ymm5, ymm0 )
|
|
+ //vextractf128(imm(1), ymm0, xmm1 )
|
|
+ //vaddpd( xmm0, xmm1, xmm0 )
|
|
|
|
- vhaddpd( ymm14, ymm11, ymm2 )
|
|
- vextractf128(imm(1), ymm2, xmm1 )
|
|
- vaddpd( xmm2, xmm1, xmm2 )
|
|
+ //vhaddpd( ymm14, ymm11, ymm2 )
|
|
+ //vextractf128(imm(1), ymm2, xmm1 )
|
|
+ //vaddpd( xmm2, xmm1, xmm2 )
|
|
|
|
- vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
|
|
+ //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
|
|
|
|
// xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
|
|
|
|
@@ -1355,67 +1355,67 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vfmadd231pd(mem(rcx), ymm3, ymm4)
|
|
vmovupd(ymm4, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovupd(ymm4, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea
|
|
Author: Devin Matthews <damatthews@smu.edu>
|
|
Date: Thu Sep 16 10:59:37 2021 -0500
|
|
|
|
Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell.
|
|
|
|
The fix is to use the same (valid) source register twice in the horizontal addition.
|
|
|
|
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
|
|
index 6e3c1a0e..457ef9f2 100644
|
|
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
|
|
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
|
|
@@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
-
|
|
+
|
|
mov(var(a), rax) // load address of a.
|
|
mov(var(rs_a), r8) // load rs_a
|
|
//mov(var(cs_a), r9) // load cs_a
|
|
@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
|
|
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -163,19 +163,19 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
|
|
prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
|
|
#endif
|
|
-
|
|
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -206,7 +206,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rbx ), ymm0)
|
|
@@ -233,7 +233,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -287,27 +287,27 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -336,21 +336,21 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -358,7 +358,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm0)
|
|
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
|
|
|
|
@@ -381,12 +381,12 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm14)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -399,28 +399,28 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
// ymm10
|
|
// ymm12
|
|
// ymm14
|
|
-
|
|
- vhaddpd( ymm5, ymm4, ymm0 )
|
|
+
|
|
+ vhaddpd( ymm4, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm4 )
|
|
|
|
- vhaddpd( ymm7, ymm6, ymm0 )
|
|
+ vhaddpd( ymm6, ymm6, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm6 )
|
|
|
|
- vhaddpd( ymm9, ymm8, ymm0 )
|
|
+ vhaddpd( ymm8, ymm8, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm8 )
|
|
|
|
- vhaddpd( ymm11, ymm10, ymm0 )
|
|
+ vhaddpd( ymm10, ymm10, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm10 )
|
|
|
|
- vhaddpd( ymm13, ymm12, ymm0 )
|
|
+ vhaddpd( ymm12, ymm12, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm12 )
|
|
|
|
- vhaddpd( ymm15, ymm14, ymm0 )
|
|
+ vhaddpd( ymm14, ymm14, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm14 )
|
|
|
|
@@ -435,114 +435,114 @@ void bli_dgemmsup_rd_haswell_asm_6x1
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
|
|
vmulpd(xmm0, xmm6, xmm6)
|
|
vmulpd(xmm0, xmm8, xmm8)
|
|
vmulpd(xmm0, xmm10, xmm10)
|
|
vmulpd(xmm0, xmm12, xmm12)
|
|
vmulpd(xmm0, xmm14, xmm14)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm4)
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm6)
|
|
vmovsd(xmm6, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm8)
|
|
vmovsd(xmm8, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm10)
|
|
vmovsd(xmm10, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm12)
|
|
vmovsd(xmm12, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm14)
|
|
vmovsd(xmm14, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm6, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm8, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm10, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm12, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm14, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
-
|
|
+
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
|
|
-
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
@@ -613,9 +613,9 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
-
|
|
+
|
|
mov(var(a), rax) // load address of a.
|
|
mov(var(rs_a), r8) // load rs_a
|
|
//mov(var(cs_a), r9) // load cs_a
|
|
@@ -633,7 +633,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
|
|
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -671,19 +671,19 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
|
|
prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
|
|
#endif
|
|
-
|
|
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -705,7 +705,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm8)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rbx ), ymm0)
|
|
@@ -723,7 +723,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -759,27 +759,27 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm8)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -799,21 +799,21 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm8)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -821,7 +821,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm0)
|
|
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
|
|
|
|
@@ -835,12 +835,12 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm8)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -850,16 +850,16 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
// ymm4
|
|
// ymm6
|
|
// ymm8
|
|
-
|
|
- vhaddpd( ymm5, ymm4, ymm0 )
|
|
+
|
|
+ vhaddpd( ymm4, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm4 )
|
|
|
|
- vhaddpd( ymm7, ymm6, ymm0 )
|
|
+ vhaddpd( ymm6, ymm6, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm6 )
|
|
|
|
- vhaddpd( ymm9, ymm8, ymm0 )
|
|
+ vhaddpd( ymm8, ymm8, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm8 )
|
|
|
|
@@ -871,87 +871,87 @@ void bli_dgemmsup_rd_haswell_asm_3x1
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
|
|
vmulpd(xmm0, xmm6, xmm6)
|
|
vmulpd(xmm0, xmm8, xmm8)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm4)
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm6)
|
|
vmovsd(xmm6, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm8)
|
|
vmovsd(xmm8, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm6, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm8, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
-
|
|
+
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
|
|
-
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
@@ -1022,9 +1022,9 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
-
|
|
+
|
|
mov(var(a), rax) // load address of a.
|
|
mov(var(rs_a), r8) // load rs_a
|
|
//mov(var(cs_a), r9) // load cs_a
|
|
@@ -1042,7 +1042,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
|
|
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -1078,19 +1078,19 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
|
|
prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
|
|
#endif
|
|
-
|
|
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -1109,7 +1109,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm6)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rbx ), ymm0)
|
|
@@ -1124,7 +1124,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -1154,27 +1154,27 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm6)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -1191,21 +1191,21 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm6)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -1213,7 +1213,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm0)
|
|
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
|
|
|
|
@@ -1224,12 +1224,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm6)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -1238,12 +1238,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
|
|
// ymm4
|
|
// ymm6
|
|
-
|
|
- vhaddpd( ymm5, ymm4, ymm0 )
|
|
+
|
|
+ vhaddpd( ymm4, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm4 )
|
|
|
|
- vhaddpd( ymm7, ymm6, ymm0 )
|
|
+ vhaddpd( ymm6, ymm6, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm6 )
|
|
|
|
@@ -1254,78 +1254,78 @@ void bli_dgemmsup_rd_haswell_asm_2x1
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
|
|
vmulpd(xmm0, xmm6, xmm6)
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm4)
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm6)
|
|
vmovsd(xmm6, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovsd(xmm4, mem(rcx))
|
|
add(rdi, rcx)
|
|
-
|
|
+
|
|
vmovsd(xmm6, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
-
|
|
+
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
|
|
-
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
@@ -1396,9 +1396,9 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
// -------------------------------------------------------------------------
|
|
|
|
begin_asm()
|
|
-
|
|
+
|
|
//vzeroall() // zero all xmm/ymm registers.
|
|
-
|
|
+
|
|
mov(var(a), rax) // load address of a.
|
|
mov(var(rs_a), r8) // load rs_a
|
|
//mov(var(cs_a), r9) // load cs_a
|
|
@@ -1416,7 +1416,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
|
|
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
|
|
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
|
|
-
|
|
+
|
|
|
|
mov(var(c), rcx) // load address of c
|
|
mov(var(rs_c), rdi) // load rs_c
|
|
@@ -1450,19 +1450,19 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
//lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c;
|
|
prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
|
|
#endif
|
|
-
|
|
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
mov(var(k_iter16), rsi) // i = k_iter16;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKITER4) // if i == 0, jump to code that
|
|
// contains the k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER16) // MAIN LOOP
|
|
-
|
|
-
|
|
+
|
|
+
|
|
// ---------------------------------- iteration 0
|
|
|
|
#if 0
|
|
@@ -1478,7 +1478,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
-
|
|
+
|
|
// ---------------------------------- iteration 1
|
|
|
|
vmovupd(mem(rbx ), ymm0)
|
|
@@ -1490,7 +1490,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
|
|
|
|
// ---------------------------------- iteration 2
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -1514,27 +1514,27 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
-
|
|
+
|
|
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER16) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DCONSIDKITER4)
|
|
-
|
|
+
|
|
mov(var(k_iter4), rsi) // i = k_iter4;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
|
|
// considers k_left1 loop.
|
|
// else, we prepare to enter k_iter4 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
label(.DLOOPKITER4) // EDGE LOOP (ymm)
|
|
-
|
|
+
|
|
#if 0
|
|
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
|
|
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
|
|
@@ -1548,21 +1548,21 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKITER4) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
label(.DCONSIDKLEFT1)
|
|
-
|
|
+
|
|
mov(var(k_left1), rsi) // i = k_left1;
|
|
test(rsi, rsi) // check i via logical AND.
|
|
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
|
// else, we prepare to enter k_left1 loop.
|
|
-
|
|
-
|
|
+
|
|
+
|
|
|
|
|
|
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
|
|
@@ -1570,7 +1570,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
// using the xmm registers would zero out the
|
|
// high bits of the destination registers,
|
|
// which would destory intermediate results.
|
|
-
|
|
+
|
|
vmovsd(mem(rbx ), xmm0)
|
|
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
|
|
|
|
@@ -1578,12 +1578,12 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
|
|
vfmadd231pd(ymm0, ymm3, ymm4)
|
|
|
|
-
|
|
+
|
|
dec(rsi) // i -= 1;
|
|
jne(.DLOOPKLEFT1) // iterate again if i != 0.
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
|
|
|
|
|
|
@@ -1591,8 +1591,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
label(.DPOSTACCUM)
|
|
|
|
// ymm4
|
|
-
|
|
- vhaddpd( ymm5, ymm4, ymm0 )
|
|
+
|
|
+ vhaddpd( ymm4, ymm4, ymm0 )
|
|
vextractf128(imm(1), ymm0, xmm1 )
|
|
vaddpd( xmm0, xmm1, xmm4 )
|
|
|
|
@@ -1602,69 +1602,69 @@ void bli_dgemmsup_rd_haswell_asm_1x1
|
|
|
|
//mov(var(rs_c), rdi) // load rs_c
|
|
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
|
|
-
|
|
+
|
|
mov(var(alpha), rax) // load address of alpha
|
|
mov(var(beta), rbx) // load address of beta
|
|
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
|
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
|
-
|
|
+
|
|
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
//mov(var(cs_c), rsi) // load cs_c
|
|
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
// now avoid loading C if beta == 0
|
|
-
|
|
+
|
|
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
|
|
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
|
|
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORED)
|
|
-
|
|
|
|
- vmovsd(mem(rcx), xmm0)
|
|
+
|
|
+ vmovsd(mem(rcx), xmm0)
|
|
vfmadd231pd(xmm0, xmm3, xmm4)
|
|
vmovsd(xmm4, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
jmp(.DDONE) // jump to end.
|
|
-
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DBETAZERO)
|
|
-
|
|
|
|
-
|
|
+
|
|
+
|
|
label(.DROWSTORBZ)
|
|
-
|
|
-
|
|
+
|
|
+
|
|
vmovsd(xmm4, mem(rcx))
|
|
//add(rdi, rcx)
|
|
-
|
|
|
|
-
|
|
-
|
|
-
|
|
+
|
|
+
|
|
+
|
|
+
|
|
label(.DDONE)
|
|
-
|
|
+
|
|
|
|
|
|
|
|
label(.DRETURN)
|
|
|
|
-
|
|
+
|
|
|
|
end_asm(
|
|
: // output operands (none)
|
|
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
index 21dd3b89..516bfced 100644
|
|
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
|
|
@@ -1338,17 +1338,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4
|
|
|
|
vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
|
|
|
|
-
|
|
- //vhaddpd( ymm8, ymm5, ymm0 )
|
|
- //vextractf128(imm(1), ymm0, xmm1 )
|
|
- //vaddpd( xmm0, xmm1, xmm0 )
|
|
-
|
|
- //vhaddpd( ymm14, ymm11, ymm2 )
|
|
- //vextractf128(imm(1), ymm2, xmm1 )
|
|
- //vaddpd( xmm2, xmm1, xmm2 )
|
|
-
|
|
- //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
|
|
-
|
|
// xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
|
|
|
|
|