2024-07-25 10:27:17 +02:00

5912 lines
209 KiB
Fortran

! {{{ ptcore
************************************************************************
subroutine ptcore(t,tt,fij,fab,co,first,last,dgroup,bsymmv,
$symmocc,iscrs,localcc,ui,esdi,et,iui,dfnb,ittend,
$ijij,ijia,ijab,ccsdalg,imo,lccoporder,ccmem,talg,
$irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg,
$iout,maxcor,imem,dcore,iimem,icore,wsize,gbasfile,ecc,eref,locnoc,
$mpi_rank,mpi_size,imove, mpi_ccsd,
$ccsdmkl, ptthreads, omp_max_threads, mkl_max_threads,
$ccsdrest, ptrest, bcast_comm, ccsd_communicator, emp2corr,
$lno, lnaf, eppl_correction, emp2, emp2full, master_rank,
$master_thread,lf12,ppl_calc)
************************************************************************
implicit none
#if defined(MPI)
include "mpif.h"
#endif
integer irecln,ibufln,nbasis,nocc,nvirt,dgroup,nirmax
integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
integer iout,maxcor,imem,iimem,wsize,gbasfile,locnoc,vsize
real*8 dcore(*), ecc, eref, tfact, tscale(nocc), ets
integer icore(*), ccsd_communicator
integer a,i,j,nquad,iquad,nqmin, bcast_comm
integer ttind,ccmem(8)
real*8 tt(nvirt,nvirt,(nocc+1)*nocc/2)
real*8 t(nvirt,nocc)
real*8 fab(nvirt,nvirt),fij(nocc,nocc)
real*8 ui(nocc)
real*8 esdi, emp2corr, emp2, emp2full
character*8 ccsdalg,lccoporder
integer dfnb, ijij, ijia, ijab
integer itemp, imove, master_rank
integer mpi_rank, mpi_size, mpi_err
logical mpi_ccsd, lno, lnaf, master_thread,lf12
character*3 ccsdmkl
character*4 ccsdrest
logical ptrest,ppl_calc
integer ptthreads, omp_max_threads, mkl_max_threads
integer co(nbasis)
integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
integer abcistoremin,jlen,iIabIJ,ittl2,ittl3,ittend,ovmax
integer imem1,avmem,iscrs,dblalloc,iui
common/memcom/ imem1
C Vars for (T) corr
integer ifdo,ifdv,itnew,ittnew,iaijk,iabij,iv,iw
integer iabci,iscro,itrr,rrd,talgoccmin
c character*1 ison
character*4 localcc,talg
character*8 dfintran
character*16 laptol
integer iv1,iv2,iv3,iw1,iw2,iw3
integer imo,itmp
real*8 et,ltol
integer iss,ittl,it1,laplminmem,iblocki,nblocki,xyzomp
integer iapbc,illjab,illjpai,tmpmemdgemm,abciblmemmax
logical abci_inmem,lapl
c times
real*8 ttmp(6,10),times(6,0:100)
integer list(100),ii,imax
character*80 txt(0:100)
integer nit, omp_get_max_threads
real*8 ecc_ppl_scaled, ecc_ppl, eppl_correction
real*8 et_corrected
interface ! {{{
c {{{ interfaces for pointers
subroutine rpoint1d(egydim1,egydim2,dim1)
implicit none
integer dim1
real*8,target :: egydim1(dim1)
real*8, pointer :: egydim2(:)
end subroutine
c}}}
end interface ! }}}
#ifdef OMP
xyzomp = omp_get_max_threads()
#else
xyzomp = 1
#endif
times(1:6,0:100)=0.d0
et=0.d0
ets=0.d0
tfact=(emp2full-eref)/(emp2-eref)
C Calculate (T) correction
write(iout,*)
write(iout,*) 'Calculation of (T) correction...'
write(iout,*)
call flush(iout)
TIME0(ttmp)
TIME0(times(1,96))
ovmax=max(nocc,nvirt)
lapl=.false.
if (talg.eq.'lapl'.or.talg.eq.'topr'.or.talg.eq.'to '.or.
$ talg.eq.'lato') lapl=.true.
if(nocc.eq.0 .or. nvirt.eq.0) then
tfact=0.d0
goto 123
end if
c
TIME0(times)
C {{{ Preparing memory, fock, T1 and T2
c common part
call dbldealloc(iscrs) ! has to match the end of memory where talgmem is calculated @ccsdpt
ifdo=dblalloc(nocc)
ifdv=dblalloc(nvirt)
C Sort diagonals of the fock matrix
do i=1,nocc
dcore(ifdo+i-1)=fij(i,i)
enddo
do a=1,nvirt
dcore(ifdv+a-1)=fab(a,a)
enddo
C decompressing t2 amplitudes
itnew=dblalloc(nvirt*nocc)
call dcopy(nvirt*nocc,t,1,dcore(ittend),1)
call dcopy(nvirt*nocc,dcore(ittend),1,dcore(itnew),1)
ittnew=dblalloc(nvirt**2*nocc**2) ! must immediately follow itnew in memory
call dcopy(nvirt**2*(nocc+1)*nocc/2,tt,1,dcore(ittend),1) ! save tt, ittnew may overwrite tt
ttind=0
do j=1,nocc
do i=1,j
ttind=ttind+1
call dcopy(nvirt**2,dcore(ittend+(ttind-1)*nvirt**2),1,
$dcore(ittnew+nvirt**2*(i-1)+nvirt**2*nocc*(j-1)),1)
call tr(nvirt, dcore(ittend+(ttind-1)*nvirt**2),
$dcore(ittnew+nvirt**2*(j-1)+nvirt**2*nocc*(i-1)))
enddo
enddo
c allocate remaining arrays
c talg=virt {{{
if(talg .eq. 'virt') then
!{{{
illjpai=imem
iIabIJ=imem
illjab=imem
iss=imem
ittl=imem
ittl2=imem
ittl2=imem
ittl3=imem
it1=imem
iabci=imem
abci_inmem=.true.
iapbc=imem
nblocki=1
iblocki=nocc-1
!}}}
jlen=nocc
if(ccsdalg .eq. 'dfdirect') then
call reorderdfints(dcore(ijij), dcore(ijab),
$ nocc, nvirt, dfnb, dcore(imove))
end if
iaijk=dblalloc(nvirt*nocc**3)
iabij=dblalloc(nvirt**2*nocc*jlen) ! abij only for j=1 if lapl
avmem=maxcor-(imem-imem1) ! for talg=occ
itrr=dblalloc(nvirt**2*nocc**2) ! store also rr=tt^traspose for the full tt
rrd=nocc
if(localcc.ne.'off ')then
iv1=dblalloc(nocc**2)
iv2=dblalloc(nocc**2)
iv3=dblalloc(nocc**2)
iw1=dblalloc(nocc**2)
iw2=dblalloc(nocc**2)
iw3=dblalloc(nocc**2)
endif
iscro=dblalloc(nocc**3)
iw=dblalloc(nocc**3)
iv=dblalloc(wsize)
end if
c }}}
c talg=occ {{{
if(talg .eq. 'occ ') then
!{{{
iv1=1
iv2=1
iv3=1
iw1=1
iw2=1
iw3=1
iscro=1
nblocki=1
iblocki=nocc-1
illjpai=imem
iIabIJ=imem
illjab=imem
iss=imem
ittl=imem
ittl2=imem
ittl2=imem
ittl3=imem
it1=imem
iapbc=imem
iabci=imem
abci_inmem=.true.
!}}}
itemp = dblalloc(nocc*nvirt)
jlen=nocc
if(ccsdalg .eq. 'dfdirect') then
call reorderdfints(dcore(ijij), dcore(ijab),
$ nocc, nvirt, dfnb, dcore(imove))
end if
iaijk=dblalloc(nvirt*nocc**3)
if (ccsdalg.eq.'dfdirect') then
talgoccmin=ptthreads*(nvirt*nocc*max(nvirt,nocc) +
$ 3*nvirt**3+wsize) + nvirt**3 !minimal memory to be allocated besides abij: min trr, w,v, min abci
if (talgoccmin+nocc**2*nvirt**2 .gt. maxcor-(imem-imem1))
$ jlen=1 ! full abij does not fit in memory
if (jlen.eq.1) iIabIJ=dblalloc(nvirt**2*ptthreads)
endif
iabij=dblalloc(nvirt**2*nocc*jlen) ! abij only for j=1 if lapl
avmem=maxcor-(imem-imem1) ! for talg=occ
call rralloc(avmem,nocc,nvirt,itrr,rrd,
$ nvirt**3 + ptthreads*(3*nvirt**3+wsize))
iw=dblalloc(nvirt**3*ptthreads)
vsize=wsize*ptthreads
iv=dblalloc(vsize)
endif
c }}}
c talg=lapl {{{
if(lapl) then
!{{{
iv1=1
iv2=1
iv3=1
iw1=1
iw2=1
iw3=1
iscro=1
!}}}
itemp = dblalloc(nocc*nvirt)
jlen=nocc
if(lccoporder.eq.'trffirst' .or. ccsdalg .eq. 'disk ')
$ imo=dblalloc(nocc**2) ! occ basis trf mx to the GSL basis
if(ccsdalg .eq. 'disk ') then
ijij=dblalloc(dfnb*nocc**2)
ijia=dblalloc(dfnb*nocc*nvirt)
ijab=dblalloc(dfnb*nvirt**2)
itmp=dblalloc(dfnb*max(nvirt*(nvirt+1)/2,nocc*nvirt, ! tmp array for readdfints
$ nocc*(nocc+1)/2))
else
itmp = imem
end if
call readdfints(dcore(ijij),dcore(ijia),dcore(ijab),dcore(imo),
$ nocc,nvirt,dfnb,dcore(itmp),dcore(itmp),dcore(itmp),
$ localcc,ccsdalg,lccoporder,dcore(imove))
call dbldealloc(itmp)
iaijk=dblalloc(max(nvirt*nocc**3,dfnb*nocc*ovmax)) ! also jij and ljapi in trf2laplbasis
if(localcc.ne.'off ') jlen=1
illjpai=dblalloc(dfnb*nvirt*nocc)
iIabIJ=dblalloc(ovmax**2*ptthreads)
illjab=ijab ! warning lljab overwrites jab
iabij=dblalloc(nvirt**2*nocc*jlen) ! abij only for j=1 if lapl
iss=ittnew
ittl=dblalloc(nvirt*nocc*ovmax)! Laplace second type of T2(-,~,-,-k) for k=1
ittl2=dblalloc(nvirt**2*nocc)! Laplace second type of T2(-,~,-k,-) for k=1
ittl3=dblalloc(nvirt**2*ptthreads)! Laplace second type of T2(-,~,-i,-j) for fixed i and j
it1=dblalloc(nocc*ovmax) ! an other for t1
iw=dblalloc(
$ max(nvirt**3*ptthreads,dfnb*nocc**2,nocc**2*nvirt)) ! for W and for jij, labc, ofock1, ttijb, ttbij in trf2laplbasis
vsize=max(wsize*ptthreads,dfnb)
iv=dblalloc(vsize)
!minimum memory for blocked abci: abci block for a single occ index & abc1 & Apbc & rr for k=1
laplminmem = 2*nvirt**3 + nvirt**2*(dfnb+nocc*ptthreads)
abcistoremin = nvirt**3*nocc + nvirt**2*nocc*ptthreads ! store full abci & rr(a,b,i,1)
if(localcc .eq. 'off ') then
ccmem(7)=imem+abcistoremin-imem1
else
ccmem(7)=imem+min(laplminmem,abcistoremin)-imem1
end if
abci_inmem=.false.
iapbc=imem
nblocki=1
iblocki=nocc-1
avmem=maxcor-(imem-imem1) ! for talg=occ
if (avmem.gt.abcistoremin.or.nocc.eq.1) then ! store full abci & rr(a,b,i,1) & w and v; elseif does not work for nocc=1
abci_inmem=.true.
iabci=dblalloc(nvirt**3*nocc)
avmem=maxcor-(imem-imem1)
call rralloc(avmem,nocc,nvirt,itrr,rrd,
$ (nvirt**3+wsize)*ptthreads)
elseif(avmem.gt.laplminmem.and..not.abci_inmem
$ .and.nocc.gt.1) then! store only smaller block of abci, does not work for nocc=1
tmpmemdgemm=xyzomp*(2*nvirt**3+nvirt**2) ! extra unallocated space for dgemms
write(iout,"(f12.2,
$' MB more memory is required for more CPU efficient execution')")
$(abcistoremin+tmpmemdgemm+nvirt**2*nocc*(nocc-1)-avmem)
$*8.d0/(1024.d0**2)
write(iout, *)
laplminmem=laplminmem-(nvirt**2*nocc)*ptthreads ! minus the space for minimal itrr
call rralloc(avmem,nocc,nvirt,itrr,rrd,
$ laplminmem+tmpmemdgemm)
iapbc=dblalloc(dfnb*nvirt**2)
avmem=maxcor-(imem-imem1)
c determine block size, iblock and num of blocks, nblocki
abciblmemmax=avmem-!laplminmem+nvirt**2*nocc*ptthreads-
$ tmpmemdgemm ! v**2*o*ptthreads for rr
if (floor(dble(abciblmemmax)/dble(nvirt**3)) .lt. 2)
$ abciblmemmax=abciblmemmax+tmpmemdgemm ! not enough extra mem space for dgemms
iblocki=min(floor(dble(abciblmemmax)/dble(nvirt**3)), nocc) ! width of blocks, must be < nocc (abc1 is fixed at 1st position)
if (iblocki .lt. min(2, nocc)) then
write(iout,*)
$ 'Insufficient memory for Laplace (T) correction',
$ iblocki
call mrccend(1)
endif
c nblocki=floor(dble(nocc-1)/dble(iblocki)) ! number of blocks, -1 due to fixed abc1
c if (iblocki*nblocki.lt.nocc) nblocki=nblocki+1 ! +1 for the last (smaller) block if needed
iabci=dblalloc(nvirt**3*iblocki)
else
write(iout,*)'Insufficient memory!',
$ (laplminmem-avmem)*8.d0/1024**3,
$ 'GB more memory is required'
call mrccend(1)
endif
if (.not.abci_inmem .and. localcc.eq.'off ') then
write(iout,*)
$ 'Full abci list is stored in memory for canonical',
$ 'Laplace transformed (T), insufficient memory!'
call mrccend(1)
endif
c if (localcc.ne.'off ') then
c write(iout,'(f12.2," MB memory is utilized for (T)")')
c $ (imem-imem1)*8.d0/1024**2
c ccmem(8)=imem-imem1
c endif
endif
c }}}
c!{{{
c if(talg.eq.'occ ' .or. lapl) itemp = dblalloc(nocc*nvirt)
c jlen=nocc
c
c if (lapl) then
c if(lccoporder.eq.'trffirst'.or.ccsdalg .eq. 'disk ')
c $ imo=dblalloc(nocc**2) ! occ basis trf mx to the GSL basis
c if(ccsdalg .eq. 'disk ') then
c ijij=dblalloc(dfnb*nocc**2)
c ijia=dblalloc(dfnb*nocc*nvirt)
c ijab=dblalloc(dfnb*nvirt**2)
c itmp=dblalloc(dfnb*max(nvirt*(nvirt+1)/2,nocc*nvirt, ! tmp array for readdfints
c $ nocc*(nocc+1)/2))
c else
c itmp = imem
c end if
c call readdfints(dcore(ijij),dcore(ijia),dcore(ijab),dcore(imo),
c $nocc,nvirt,dfnb,dcore(itmp),dcore(itmp),dcore(itmp),localcc,
c $ccsdalg,lccoporder,dcore(imove))
c call dbldealloc(itmp)
c iaijk=dblalloc(max(nvirt*nocc**3,dfnb*nocc*ovmax)) ! also jij and ljapi in trf2laplbasis
c if(localcc.ne.'off ') jlen=1
c illjpai=dblalloc(dfnb*nvirt*nocc)
c iIabIJ=dblalloc(ovmax**2*ptthreads)
c illjab=ijab ! warning lljab overwrites jab
c else
c if(ccsdalg .eq. 'dfdirect') then
c call reorderdfints(dcore(ijij), dcore(ijab),
c $ nocc, nvirt, dfnb, dcore(imove))
c end if
c iaijk=dblalloc(nvirt*nocc**3)
c illjpai=imem
c iIabIJ=imem
c illjab=imem
c endif
c if (talg.eq.'occ '.and.ccsdalg.eq.'dfdirect') then
cc talgoccmin=nvirt**2*nocc+nvirt**3+wsize !minimal memory to be allocated besides abij: min trr, w,v, min abci
c talgoccmin=ptthreads*(nvirt*nocc*max(nvirt,nocc) +
c $ 3*nvirt**3+wsize) + nvirt**3 !minimal memory to be allocated besides abij: min trr, w,v, min abci
c if (talgoccmin+nocc**2*nvirt**2.gt.maxcor-(imem-imem1)) jlen=1 ! full abij does not fit in memory
c if (jlen.eq.1) iIabIJ=dblalloc(nvirt**2*ptthreads)
c endif
c iabij=dblalloc(nvirt**2*nocc*jlen) ! abij only for j=1 if lapl
c if (lapl) then
c iss=ittnew
c ittl=dblalloc(nvirt*nocc*ovmax)! Laplace second type of T2(-,~,-,-k) for k=1
c ittl2=dblalloc(nvirt**2*nocc)! Laplace second type of T2(-,~,-k,-) for k=1
c ittl3=dblalloc(nvirt**2*ptthreads)! Laplace second type of T2(-,~,-i,-j) for fixed i and j
c it1=dblalloc(nocc*ovmax) ! an other for t1
c else
c iss=imem
c ittl=imem
c ittl2=imem
c ittl2=imem
c endif
c!minimum memory for blocked abci: abci block for a single occ index & abc1 & w & v & Apbc & rr for k=1
c if (lapl) then
c laplminmem=(2+ptthreads)*nvirt**3+wsize*ptthreads+
c $ nvirt**2*(dfnb+nocc)
c abcistoremin=nvirt**3*(nocc+ptthreads)+wsize*ptthreads+
c $ nvirt**2*nocc ! store full abci & rr(a,b,i,1) & w and v
c if(localcc .eq. 'off ') then
c ccmem(7)=imem+abcistoremin-imem1
c else
c ccmem(7)=imem+min(laplminmem,abcistoremin)-imem1
c end if
c endif
c abci_inmem=.true.
c if (lapl) abci_inmem=.false.
c iapbc=imem
c nblocki=1
c iblocki=nocc-1
c avmem=maxcor-(imem-imem1) ! for talg=occ
c if (talg.eq.'virt') then
c itrr=dblalloc(nvirt**2*nocc**2) ! store also rr=tt^traspose for the full tt
c rrd=nocc
c elseif (talg.eq.'occ ') then
c call rralloc(avmem,nocc,nvirt,itrr,rrd,
c $ nvirt**3 + ptthreads*(3*nvirt**3+wsize))
c elseif (lapl) then
cc write(iout,*) 'WARNING restore if/elseif for iblocki test'
c if (avmem.gt.abcistoremin.or.nocc.eq.1) then ! store full abci & rr(a,b,i,1) & w and v; elseif does not work for nocc=1
cc if (.false.) then
c abci_inmem=.true.
c iabci=dblalloc(nvirt**3*nocc)
c avmem=maxcor-(imem-imem1)
c call rralloc(avmem,nocc,nvirt,itrr,rrd,
c $ (nvirt**3+wsize)*ptthreads)
c elseif(avmem.gt.laplminmem.and..not.abci_inmem
c $ .and.nocc.gt.1) then! store only smaller block of abci, does not work for nocc=1
cc elseif(.true.) then
c tmpmemdgemm=xyzomp*(2*nvirt**3+nvirt**2) ! extra unallocated space for dgemms
c write(iout,"(f12.2,
c $' MB more memory is required for more CPU efficient execution')")
c $ (abcistoremin+tmpmemdgemm+nvirt**2*nocc*(nocc-1)-avmem)
c $ *8.d0/1024**2
c laplminmem=laplminmem-(nvirt**2*nocc)*ptthreads ! minus the space for minimal itrr
c call rralloc(avmem,nocc,nvirt,itrr,rrd,
c $ laplminmem+tmpmemdgemm)
c avmem=maxcor-(imem-imem1)
cc determine block size, iblock and num of blocks, nblocki
c abciblmemmax=avmem-laplminmem+nvirt**2*nocc*ptthreads-
c $ tmpmemdgemm ! v**2*o*ptthreads for rr
c if (floor(dble(abciblmemmax)/dble(nvirt**3)).lt.
c $ (1.d0+ptthreads))
c $ abciblmemmax=abciblmemmax+tmpmemdgemm ! not enough extra mem space for dgemms
c iblocki=min(floor(dble(abciblmemmax)/dble(nvirt**3)),
c $ max(nocc-1,1)) ! width of blocks, must be < nocc (abc1 is fixed at 1st position)
cc write(*,*) 'tmpmemdgemm,abciblmemmax,avmem,laplminmem,iblocki'! dbprt xxx
cc write(*,*) tmpmemdgemm,abciblmemmax,avmem,laplminmem,iblocki ! dbprt xxx
cc write(*,*) dble(abciblmemmax)/dble(nvirt**3) ! dbprt xxx
cc flush(6)
c if (iblocki+1.lt.2) then
c write(iout,*)'Insufficient memory for Laplace (T) correction'
c $ ,iblocki
c call mrccend(1)
c endif
c nblocki=floor(dble(nocc-1)/dble(iblocki)) ! number of blocks, -1 due to fixed abc1
c if (1+iblocki*nblocki.lt.nocc) nblocki=nblocki+1 ! +1 for the last (smaller) block if needed
c iapbc=dblalloc(dfnb*nvirt**2)
c iabci=dblalloc(nvirt**3*(iblocki+1)) ! +1: also abc1 is kept in this array
c else
c write(iout,*)'Insufficient memory!',
c $(laplminmem-avmem)*8.d0/1024**3,'GB more memory is required'
c call mrccend(1)
c endif
c endif ! talg.eq.
cc write(*,*) 'full abci,min',(imem-imem1+nvirt**3*nocc)*8./1024**3,
cc $laplminmem*8.d0/1024**3
cc write(*,*) 'nblocki,iblocki,nocc',nblocki,iblocki
cc write(*,*) 'abci_inmem',abci_inmem,nocc
c if (.not.abci_inmem.and.localcc.eq.'off ') then
c write(iout,*)'Full abci list is stored in memory for canonical',
c $ 'Laplace transformed (T), insufficient memory!'
c call mrccend(1)
c endif
c if(localcc.ne.'off '.and.talg.eq.'virt')then
c iv1=dblalloc(nocc**2)
c iv2=dblalloc(nocc**2)
c iv3=dblalloc(nocc**2)
c iw1=dblalloc(nocc**2)
c iw2=dblalloc(nocc**2)
c iw3=dblalloc(nocc**2)
c else
c iv1=1
c iv2=1
c iv3=1
c iw1=1
c iw2=1
c iw3=1
c endif
c if(talg.eq.'virt')then
c iscro=dblalloc(nocc**3)
c elseif(talg.eq.'lapl'.or.talg.eq.'occ '.or.lapl) then
c iscro=1
c endif
c if(talg.eq.'virt')then
c iw=dblalloc(nocc**3)
c iv=dblalloc(wsize)
c elseif(talg.eq.'lapl'.or.talg.eq.'occ '.or.lapl) then
c if (talg.eq.'occ ') then
c iw=dblalloc(nvirt**3*ptthreads)
c else if (talg.eq.'lapl'.or.lapl) then
c iw=dblalloc(
c $max(nvirt**3*ptthreads,dfnb*nocc**2,nocc**2*nvirt)) ! for W and for jij, labc, ofock1, ttijb, ttbij in trf2laplbasis
c end if
c if (talg.eq.'lapl'.or.lapl) then
c vsize=max(wsize*ptthreads,dfnb)
c else if(talg .eq. 'occ ') then
c vsize=wsize*ptthreads
c else if(talg .eq. 'virt') then
c vsize=wsize
c end if
c iv=dblalloc(vsize)
c endif
c if (localcc.ne.'off '.and.talg.eq.'lapl') then
c write(iout,'(f12.2," MB memory is utilized for (T)")')
c $ (imem-imem1)*8.d0/1024**2
c ccmem(8)=imem-imem1
c endif
cc }}}!}}}
TIMEADD(times(1,24),times)
c
c evaluate (T) energy
if(talg.eq.'virt')then
call ttsort(dcore(ittnew),dcore(iabij),nocc,nvirt)
call dcopy(nocc*nvirt,dcore(itnew),1,dcore(iabij),1)
call gtrans(dcore(iabij),dcore(itnew),nvirt,nocc) !transpose t1
c if(localcc.eq.'off ')then
C Calculate (T) correction, virtual loops out
call localbtcorr(dcore(itnew),dcore(ittnew),dcore(ifdo),
$ dcore(ifdv),dcore(iaijk),dcore(iabij),dcore(iscro),
$ dcore(iw),dcore(iv),bsymmv,symmocc,dgroup,first,last,co,
$ localcc,dcore(iv1),dcore(iv2),dcore(iv3),dcore(iw1),
$ dcore(iw2),dcore(iw3),ui,et,times,dcore(itrr),ccsdalg,
$ dcore(ijij),dcore(ijia),dcore(ijab),dfnb,irecln,ibufln,
$ nbasis,nocc,nvirt,nirmax,multpg,imem,iout,maxcor,dcore)
c elseif(lapl) then
c if (talg.eq.'lapl'.or.talg.eq.'lato') then
cC Get Laplace quadrature
c call getkey('laptol',6,laptol,16)
c read(laptol,*) ltol
c nqmin=1
c if (ltol.lt.1.d0) nqmin=floor(-dlog10(ltol))+1
c write(iout,"(' Laplace threshold, nquadmin:',es10.2,i4)")
c $ ltol,nqmin
c call laplace((nvirt*nocc)**3,nquad,dcore(imem),dcore(ifdo),
c $dcore(ifdv),nvirt,nocc,dcore(ifdo),dcore(ifdv),0,0,3,3,ltol,
c $gbasfile,iout,dcore(imem),nqmin,0.d0,0,dcore(imem))
c iquad=dblalloc(nquad*(nvirt+nocc))
c write(iout,"(1x,a29,i4)")'Number of quadrature points: ',nquad
c else
c iquad=imem
c nquad=1
c endif
cC Calculate local (T) correction
c call localbtcorr_lap(dcore(itnew),dcore(ittnew),dcore(ifdo),
c $dcore(ifdv),dcore(iaijk),dcore(iabij),dcore(iw),dcore(iv),
c $bsymmv,symmocc,dgroup,first,last,co,imem,dcore,
c $et,nquad,dcore(iquad),dcore(it1),dcore(iabci),dcore(ittl),
c $dcore(iss),dcore(itrr),rrd,abci_inmem,times,dcore(imo),
c $dcore(ijij),dcore(ijia),dcore(ijab),dfnb,localcc,talg,
c $dcore(illjab),dcore(illjpai),dcore(iapbc),nblocki,iblocki,
c $dcore(iv),dcore(iIabIJ),dcore(ittl2),dcore(ittl3),ccsdalg,
c $dcore(ijia),
c $irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg,iout,
c $mpi_rank,master_thread,mpi_size,mpi_ccsd,ccsdmkl,ptthreads,omp_max_threads,
c $ccsd_communicator)
elseif(talg.eq.'occ ' .or. lapl) then
if (talg.eq.'lapl'.or.talg.eq.'lato') then
C Get Laplace quadrature
call getkey('laptol',6,laptol,16)
read(laptol,*) ltol
nqmin=1
if (ltol.lt.1.d0) nqmin=floor(-dlog10(ltol))+1
write(iout,"(' Laplace threshold, nquadmin:',es10.2,i4)")
$ ltol,nqmin
call laplace((nvirt*nocc)**3,nquad,dcore(imem),dcore(ifdo),
$dcore(ifdv),nvirt,nocc,dcore(ifdo),dcore(ifdv),0,0,3,3,ltol,
$gbasfile,iout,dcore(imem),nqmin,0.d0,0,dcore(imem))
iquad=dblalloc(nquad*(nvirt+nocc))
write(iout,"(1x,a29,i4)")'Number of quadrature points: ',
$ nquad
ccmem(7) = ccmem(7) + nquad*(nvirt+nocc)
c write(iout,'(f12.2,
c $ " MB is the minimal memory requirement for (T)")')
c $ ccmem(7)*8.d0/1024**2
c write(iout,'(f12.2," MB memory is utilized for (T)")')
c $ (imem-imem1)*8.d0/1024**2
ccmem(8)=imem-imem1
else
iquad=imem
nquad=1
endif
C Calculate (T) correction, occupied loops outside
call btcorr(dcore(itnew),dcore(ittnew),dcore(ifdo),
$ dcore(ifdv),dcore(iaijk),dcore(iabij),dcore(iw),
$ dcore(iv),bsymmv,symmocc,dgroup,first,last,co,et,times,
$ dcore(itrr),rrd,ccsdalg,dcore(ijij),dcore(ijia),
$ dcore(ijab),dfnb,jlen,dcore(iIabIJ),nquad,dcore(iquad),
$ dcore(it1),dcore(iabci),dcore(iabci+nvirt**3),
$ dcore(ittl),dcore(iss),abci_inmem,dcore(imo),localcc,
$ talg,dcore(illjab),dcore(illjpai),dcore(iapbc),nblocki,
$ iblocki,dcore(iv),dcore(ittl2),dcore(ittl3), lapl,irecln,
$ ibufln,nbasis,nocc,nvirt,nirmax,multpg, iout,maxcor,imem,
$ dcore,iimem,icore,dcore(itemp),mpi_rank,mpi_size,
$ mpi_ccsd,ccsdmkl, ptthreads, omp_max_threads,
$ mkl_max_threads, ccsdrest, ptrest, bcast_comm,
$ ccsd_communicator, lno, lnaf, ets, master_rank,
$ master_thread,lf12,tfact)
call dbldealloc(itemp)
endif
123 continue
if(localcc.ne.'off ')then
write(iout,*)
write(iout,'(" (T) correction [au]: ",f22.12)') et
write(iout,'(" CCSD(T) correlation energy [au]:",f22.12)')
$ et+esdi
write(iout,*)
call prtenergc('CCSD(T) ', et+esdi, eref, locnoc)
else
call getkey('dfintran',8,dfintran,8)
write(iout,*)
if(lf12) then
write(iout,'(" (T) correction [au]: ",
$ f22.12)') et
write(iout,'(" (T*) correction [au]: ",
$ f22.12)') et*tfact
write(iout,'(" (T+) correction [au]: ",
$ f22.12)') ets
write(iout,'(" CCSD(F12*)(T) correlation energy [au]:",
$ f22.12)') et+ecc-eref
write(iout,'(" CCSD(F12*)(T*) correlation energy [au]:",
$ f22.12)') tfact*et+ecc-eref
write(iout,'(" CCSD(F12*)(T+) correlation energy [au]:",
$ f22.12)') ets+ecc-eref
if((lno .or. lnaf) .and. dfintran.ne.'ovirt ') then
write(iout,'(" CCSD(F12*)(T) corr. energy + MP2 [au]:",
$ f22.12)') et+ecc-eref+emp2corr
write(iout,'(" CCSD(F12*)(T*) corr. energy + MP2 [au]:",
$ f22.12)') tfact*et+ecc-eref+emp2corr
write(iout,'(" CCSD(F12*)(T+) corr. energy + MP2 [au]:",
$ f22.12)') ets+ecc-eref+emp2corr
endif
write(iout,'(" Total CCSD(F12*)(T) energy [au]: ",
$ f22.12)') et+ecc
write(iout,'(" Total CCSD(F12*)(T*) energy [au]: ",
$ f22.12)') tfact*et+ecc
write(iout,'(" Total CCSD(F12*)(T+) energy [au]: ",
$ f22.12)') ets+ecc
if((lno .or. lnaf) .and. dfintran.ne.'ovirt ') then
write(iout,'(" Total CCSD(F12*)(T) energy + MP2 [au]:",
$ f22.12)') et+ecc+emp2corr
write(iout,'(" Total CCSD(F12*)(T*) energy + MP2 [au]:",
$ f22.12)') tfact*et+ecc+emp2corr
write(iout,'(" Total CCSD(F12*)(T+) energy + MP2 [au]:",
$ f22.12)') ets+ecc+emp2corr
endif
et=ets
et_corrected = et + ecc
if((lno .or. lnaf) .and. dfintran.ne.'ovirt ')
$ et_corrected = et_corrected+emp2corr
else
write(iout,
$ '(" (T) correction [au]: ",
$ f22.12)') et
if((lno .or. lnaf) .and. dfintran.ne."ovirt ") then
write(iout,
$ '(" (T*) correction [au]: ",
$ f22.12)') et*tfact
write(iout,
$ '(" (T+) correction [au]: ",
$ f22.12)') ets
end if
write(iout,
$ '(" CCSD(T) correlation energy [au]: ",
$ f22.12)') et+ecc-eref
if((lno .or. lnaf) .and. dfintran.ne."ovirt ") write(iout,
$ '(" CCSD(T) correlation energy + MP2 correction [au]:",
$ f22.12)') et+ecc-eref+emp2corr
if(ppl_calc) then !(lno .or. lnaf) .and. dfintran.ne."ovirt ") then
write(iout,
$ '(" CCSD(T+) correlation en. + MP2 + PPL corr. [au]: ",
$ f22.12)') ets+ecc-eref+emp2corr+eppl_correction
else if((lno .or. lnaf) .and. dfintran.ne."ovirt ") then
write(iout,
$ '(" CCSD(T+) correlation en. + MP2 correction [au]: ",
$ f22.12)') ets+ecc-eref+emp2corr
end if
write(iout,
$ '(" Total CCSD(T) energy [au]: ",
$ f22.12)') et+ecc
et_corrected = et + ecc
if((lno .or. lnaf) .and. dfintran.ne."ovirt ")
$ write(iout,
$ '(" Total CCSD(T) energy + MP2 correction [au]: ",
$ f22.12)') et+ecc+emp2corr
et_corrected = et + ecc + emp2corr
if(ppl_calc) then !(lno .or. lnaf) .and. dfintran.ne."ovirt ") then
write(iout,
$ '(" Total CCSD(T+) energy + MP2 + PPL corr. [au]: ",
$ f22.12)') ets+ecc+emp2corr+eppl_correction
et_corrected = ets+ecc+emp2corr+eppl_correction
else if((lno .or. lnaf) .and. dfintran.ne."ovirt ") then
write(iout,
$ '(" Total CCSD(T+) energy + MP2 correction [au]: ",
$ f22.12)') ets+ecc+emp2corr
et_corrected = ets+ecc+emp2corr
endif
write(iout,*)
c et_corrected = et + ecc - eref
end if ! lf12
c call prtenergc('CCSD(T) ', et+ecc-eref, eref, locnoc)
call prtenergc('CCSD(T) ',
c $ et+ecc-eref+emp2corr+eppl_correction, eref, locnoc)
$ et_corrected-eref, eref, locnoc)
endif
call flush(iout)
TIMEADD(times(1,23),ttmp)
c timing data
# if defined(MPI)
if(mpi_ccsd .and. mpi_size .ne. 1)
$ call MPI_Barrier(ccsd_communicator, mpi_err)
#endif
if (WRITE_TIMES) then
times(:,12:20) = times(:,12:20) / ptthreads
times(:,27:28) = times(:,27:28) / ptthreads
times(:,37:55) = times(:,37:55) / ptthreads
times(:,57:64) = times(:,57:64) / ptthreads
write(iout,*)
write(iout,"(1x,70('*'))")
write(iout,'(a94)') ' Time of manipulations:' //
$ '1 it. CPU/Wall [min], tot CPU/Wall [min], CPU/Wall ratio, tot %'
write(iout,*)
txt(12)='" total abci in (T) "'
txt(13)='" i1=i2 total abci*T2 in (T) "'
txt(14)='" i1=i2 total ijka*T2 in (T) "'
txt(15)='" i1=i3 total abci*T2 in (T) "'
txt(16)='" i1=i3 total ijka*T2 in (T) "'
txt(17)='" not = total abci*T2 in (T) "'
txt(18)='" not = total ijka*T2 in (T) "'
txt(19)='" total V constr. in (T) "'
txt(20)='" total E evaluation in (T) "'
txt(21)='" total abci*T2 in (T) "'
txt(22)='" total ijka*T2 in (T) "'
txt(23)='" total (T) correction "'
txt(24)='" T2 "'
txt(25)='" abck "'
txt(26)='" ptindices "'
txt(27)='" W & V "'
txt(28)='" communication "'
txt(29)='" test_iprobe "'
txt(30)='" cancel_after while "'
txt(31)='" isend_recv "'
times(3:4,21)=times(3:4,13)+times(3:4,15)+times(3:4,17)
times(3:4,22)=times(3:4,14)+times(3:4,16)+times(3:4,18)
imax=31
list(1:imax)=(/(i,i=1,imax)/)
do ii=12, 31 ! 1,imax
i=list(ii)
ttmp(1,1)=times(4,6)
if (i.ge.12) then
nit=1
ttmp(1,1)=times(4,23)
end if
write(txt(0),*) trim(txt(i))
write(iout,"(" // txt(0) // ",1x,7f10.3)")
$ times(3, i)/nit/60.d0, times(4, i)/nit/60.d0,
$ times(3, i)/60.d0, times(4, i)/60.d0,
$ times(3, i)/times(4, i), times(4, i)/ttmp(1, 1)*100.d0
enddo
c (T) fine grain timing
ttmp(1,1)=times(4,23)
txt(37)='" abci*T2 i = j 37 b <ba|di>*Tkjcd "'
txt(38)='" abci*T2 i = j 38 c <ca|di>*Tjkbd "'
txt(39)='" abci*T2 i = j 39 a <ac|dk>*Tijbd "'
txt(40)='" abci*T2 j = k 40 c <ca|di>*Tjkbd "'
txt(41)='" abci*T2 j = k 41 a <ac|dk>*Tijbd "'
txt(42)='" abci*T2 j = k 42 e <cb|dj>*Tikad "'
txt(43)='" abci*T2 not = 43 a <ac|dk>*Tijbd "'
txt(44)='" abci*T2 not = 44 b <ba|di>*Tkjcd "'
txt(45)='" abci*T2 not = 45 c <ca|di>*Tjkbd "'
txt(46)='" abci*T2 not = 46 d <bc|dk>*Tijad "'
txt(47)='" abci*T2 not = 47 e <cb|dj>*Tikad "'
txt(48)='" abci*T2 not = 48 f <ab|dj>*Tkicd "'
txt(49)='" ijka*T2 not = 49 perm "'
txt(50)='" ijka*T2 not = 50 a <cj|kl>*Tilab "'
txt(51)='" ijka*T2 not = 51 b <bk|jl>*Tilac "'
txt(52)='" ijka*T2 not = 52 c <bi|jl>*Tklac "'
txt(53)='" ijka*T2 not = 53 d <ak|il>*Tjlbc "'
txt(54)='" ijka*T2 not = 54 e <aj|il>*Tklcb "'
txt(55)='" ijka*T2 not = 55 f <ci|kl>*Tjlba "'
txt(56)='" ijka*T2 not = 56 sum "'
txt(57)='" ijka*T2 i = j 57 b <bk|jl>*Tilac "'
txt(58)='" ijka*T2 i = j 58 c <bi|jl>*Tklac "'
txt(59)='" ijka*T2 i = j 59 a <cj|kl>*Tilab "'
txt(60)='" ijka*T2 j = k 60 b <bk|jl>*Tilac "'
txt(61)='" ijka*T2 j = k 61 c <bi|jl>*Tklac "'
txt(62)='" ijka*T2 j = k 62 d <ak|il>*Tjlbc "'
txt(63)='" ijka*T2 i = j 63 perm "'
txt(64)='" ijka*T2 j = k 64 perm "'
c do ii=0,9
c i=40+ii
cc write(txt(i),"(" // txt(99) // ",1x,1i3)") ii+1
c if (ii.le.8) times(3:4,49)=times(3:4,49)+times(3:4,i)
c write(txt(0),*) trim(txt(i))
c write(iout,"(" // txt(0) // ",1x,7f10.3)")
c $ times(3,i)/nit/60.d0,times(4,i)/nit/60.d0,
c $ times(3,i)/60.d0,times(4,i)/60.d0,times(3,i)/times(4,i),
c $ times(4,i)/ttmp(1,1)*100.d0
c enddo
times(3:4,56)=0.d0
do ii=-13,14
i=50+ii
if(i.eq.56) cycle
if (ii.le.5) times(3:4,56)=times(3:4,56)+times(3:4,i)
write(txt(0),*) trim(txt(i))
write(iout,"(" // txt(0) // ",1x,7f10.3)")
$ times(3, i)/nit/60.d0, times(4, i)/nit/60.d0,
$ times(3, i)/60.d0, times(4, i)/60.d0,
$ times(3, i)/times(4, i), times(4, i)/ttmp(1, 1)*100.d0
enddo
endif ! timing data
if(localcc.ne.'off '.and.talg.eq.'virt')then
open(722,file='iabc')
close(722,status="delete")
endif
c call dbldealloc(iui)
return
end subroutine !}}}
c
c {{{ localbtcorr
************************************************************************
subroutine localbtcorr(t,tt,fdo,fdv,ijka,ijab,scro,w,v,
$bsymmv,symmocc,dgroup,first,last,co,localcc,
$v1,v2,v3,w1,w2,w3,ui,et,times,rr,ccsdalg,jij,jia,jab,dfnb,
$irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg,imem,iout,
$maxcor,dcore)
************************************************************************
implicit none
integer irecln,ibufln,nbasis,nocc,nvirt,dgroup,nirmax,imem,iout
integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
integer maxcor
real*8 dcore(*)
integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
integer co(nbasis)
real*8 t(nocc,nvirt),tt(nocc,nocc,nvirt,nvirt)
real*8 rr(nocc,nocc,nvirt,nvirt)
real*8 fdv(nvirt),fdo(nocc)
real*8 et,sum,y,z
integer a,b,c,i,j,k
real*8 w(nocc,nocc,nocc),v(nocc,nocc,nocc)
real*8 delta,delta1,delta2,delta3
real*8 fcb,fcba,fkj,fkji
real*8 ijab(nocc,nocc,nvirt,nvirt)
real*8 ijka(nocc,nocc,nocc,nvirt)
real*8 scro(nocc,nocc,nocc)
real*8 w1(nocc,nocc),w2(nocc,nocc),w3(nocc,nocc)
real*8 v1(nocc,nocc),v2(nocc,nocc),v3(nocc,nocc)
real*8 ui(nocc),pr,prold,dcba
cnp real*8 cut !NP
logical notend
integer ind,n,avmem,la,lb,astart,istart,prk,cba
integer imem1
common/memcom/imem1
character*4 localcc
character*8 ccsdalg
integer dfnb, ic
real*8 jij(dfnb, nocc, nocc), jia(nocc, nvirt, dfnb)
real*8 jab(dfnb, nvirt, nvirt)
real*8 scr(nvirt, nocc), scr1o(nocc), scr1v(nvirt)
real*8, pointer :: iab_c(:,:,:,:), ibc_ofa(:,:,:), ica_ofb(:,:,:)
C !NP
real*8 times(6,0:100)
interface
c {{{ interfaces for pointers
subroutine rpoint3d(egydim,haromdim,dim1,dim2,dim3)
implicit none
integer dim1,dim2,dim3
real*8,target :: egydim(dim1,dim2,dim3)
real*8, pointer :: haromdim(:,:,:)
end subroutine
subroutine rpoint4d(egydim,haromdim,dim1,dim2,dim3,dim4)
implicit none
integer dim1,dim2,dim3,dim4
real*8,target :: egydim(dim1,dim2,dim3,dim4)
real*8, pointer :: haromdim(:,:,:,:)
end subroutine
c}}}
end interface
cnp character*8 tvirtcut,cscr !NP
cnp call getkey('tvirtcut',8,tvirtcut,8)
cnp read(tvirtcut,*) cut !NP
cnp cut=cut/100.d0
cnp if (cut.lt.1.d0) then
cnp write(cscr,'(f8.1)') (1.d0-cut)*100.d0
cnp write(iout,*)
cnp write(iout,*) 'Highest lying' // trim(cscr) // '% of virtual MOs'
cnp $ // ' are excluded from (T) correction'
cnp write(iout,*)
cnp endif
!NP
c
cc TIME0(times)
c transpose T2 to R
do a=1,nvirt
do b=1,nvirt
call dcopy(nocc**2,tt(1:nocc,1:nocc,a,b),1,
$ rr(1:nocc,1:nocc,b,a),1)
enddo
enddo
if(ccsdalg .eq. 'disk ') then
call ijabread(ijab,bsymmv,symmocc,dgroup,first,last,co,v,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call ijkaread(ijka,bsymmv,symmocc,dgroup,first,last,co,v,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call sortabci(bsymmv,symmocc,dgroup,co,first,last,v,
$ maxcor,imem,imem1,dcore,nocc,nvirt,nbasis,iout,nirmax,
$ irecln,ibufln,multpg)
else if(ccsdalg .eq. 'dfdirect') then
! (ia|jb) = <ij|ab>
call dgemm('n', 't', nocc*nvirt, nocc*nvirt, dfnb,
$ 1.d0, jia, nocc*nvirt, jia, nocc*nvirt,
$ 0.d0, ijab, nocc*nvirt) ! store (ia|jb) in ijab
! iajb -> ijab
do b = 1, nvirt
do i = 1, nocc
call dcopy(nvirt*nocc, ijab(i, 1, 1, b), nocc,
$ scr, 1)
do j = 1, nocc
ijab(i, j, 1:nvirt, b) = scr(1:nvirt, j)
end do
end do
end do
! (ik|ja) = <ij|ka>
call dgemm('t', 't', nocc**2, nocc*nvirt, dfnb,
$ 1.d0, jij, dfnb, jia, nocc*nvirt,
$ 0.d0, ijka, nocc**2)
! ikja -> ijka
do a = 1, nvirt
do i = 1, nocc
do k = 1, nocc
scr1o(k+1:nocc) = ijka(i, k+1:nocc, k, a)
ijka(i, k+1:nocc, k, a) = ijka(i, k, k+1:nocc, a)
ijka(i, k, k+1:nocc, a) = scr1o(k+1:nocc)
end do
end do
end do
end if
cc TIMEADD(times(1,12),times)
C Construct W Eq.4
et=0.d0
ind=1
notend=.true.
prk=0
pr=0.d0
prold=0.d0
c all cba combination:
cba=0
do k=1,nvirt
do i=k+1,nvirt ! j=k
cba=cba+1
enddo
do j=k+1,nvirt ! j>k
do i=j,nvirt
cba=cba+1
enddo
enddo
enddo
c dnvirt=dble(nvirt)
dcba=dble(cba)
do while(notend)
avmem=maxcor-(imem-imem1)
if(avmem.ge.3*nvirt**2*nocc)then
n=(avmem-modulo(avmem,nvirt**2*nocc))/(nvirt**2*nocc)
else
write(iout,*)'Insufficient memory for <ia|bc> block!'
call mrccend(1)
endif
c
if (ind.eq.1) then
write(iout,'(f12.2," MB memory is utilized for (T)")')
$ (imem+n*nvirt**2*nocc-imem1)*8.d0/1024**2
write(iout,'(" 1% done.")')
endif
c
if(ind+n-1.ge.nvirt)then
notend=.false.
n=nvirt-ind+1
endif
if(ind+n-1.ne.nvirt)then
n=n-2
endif
call rpoint4d(dcore(imem), iab_c, nocc, nvirt, nvirt, n)
if (notend) then
call rpoint3d(dcore(imem+nocc*nvirt**2*n), ica_ofb,
$ nocc, nvirt, nvirt)
call rpoint3d(dcore(imem+nocc*nvirt**2*(n+1)), ibc_ofa,
$ nocc, nvirt, nvirt)
endif
cc TIME0(times)
if(ccsdalg .eq. 'disk ') then
c call iabcread(n,ind,dcore(imem),v,nocc,nvirt,irecln,ibufln)
call iabcread(n,ind,iab_c,v,nocc,nvirt,irecln,ibufln)
else if(ccsdalg .eq. 'dfdirect') then
! (ib|ac) = <ia|bc>
call dgemm('n', 'n', nocc*nvirt, nvirt*n, dfnb,
$ 1.d0, jia, nocc*nvirt, jab(1, 1, ind), dfnb,
$ 0.d0, iab_c, nocc*nvirt)
! ibac -> iabc
do c = 1, n
do i = 1, nocc
do b = 1, nvirt
scr1v(b+1:nvirt) = iab_c(i, b+1:nvirt, b, c)
iab_c(i,b+1:nvirt,b,c) = iab_c(i,b,b+1:nvirt,c)
iab_c(i, b, b+1:nvirt, c) = scr1v(b+1:nvirt)
end do
end do
end do
end if
cc TIMEADD(times(1,12),times)
c
do c=ind,ind+n-1
pr=dble(prk)/dcba
if(pr.ge.prold+0.1d0.and.pr.ne.1.d0)then
prold=pr
write(iout,'(i4,"% done.")')nint(pr*100)
endif
cnp if (c.gt.cut*nvirt) cycle !NP since a>=b>=c; a,b,c>cut*nvirt
do b=c,nvirt
if(b.gt.ind+n-1)then
lb=n+1
cc TIME0(times)
if(ccsdalg .eq. 'disk ') then
c call iabcread(1,b,dcore(imem+(lb-1)*nocc*nvirt**2),v,
c $ nocc,nvirt,irecln,ibufln)
call iabcread(1,b,ica_ofb,v,nocc,nvirt,irecln,ibufln)
else
! (ia|cb) = <ic|ab>
call dgemm('n', 'n', nocc*nvirt, nvirt, dfnb,
$ 1.d0, jia, nocc*nvirt, jab(1, 1, b), dfnb,
$ 0.d0, ica_ofb, nocc*nvirt)
! iacb -> icab
do i = 1, nocc
do a = 1, nvirt
scr1v(a+1:nvirt) = ica_ofb(i, a+1:nvirt, a)
ica_ofb(i, a+1:nvirt, a) = ica_ofb(i, a, a+1:nvirt)
ica_ofb(i, a, a+1:nvirt) = scr1v(a+1:nvirt)
end do
end do
end if
cc TIMEADD(times(1,12),times)
else
lb=b-ind+1
endif
delta1=1.d0
if(b.eq.c)delta1=delta1+1.d0
fcb=fdv(c)+fdv(b)
if(b.eq.c)then !a=b=c => no contribution!
astart=b+1
else
astart=b
endif
cnp if (b.gt.cut*nvirt) cycle !NP since a>=b; b,c>cut*nvirt
do a=astart,nvirt
c do a=b,nvirt
la=a-ind+1
if(a.gt.ind+n-1.and.a.ne.b)then
la=n+2
cc TIME0(times)
if(ccsdalg .eq. 'disk ') then
c call iabcread(1,a,dcore(imem+(la-1)*nocc*nvirt**2),v,nocc,nvirt,irecln,ibufln)
call iabcread(1,a,ibc_ofa,v,nocc,nvirt,irecln,ibufln)
else
! (ic|ba) = <ib|ca>
call dgemm('n', 'n', nocc*nvirt, nvirt, dfnb,
$ 1.d0, jia, nocc*nvirt, jab(1, 1, a), dfnb,
$ 0.d0, ibc_ofa, nocc*nvirt)
! icba -> ibca
do i = 1, nocc
do ic = 1, nvirt
scr1v(ic+1:nvirt) = ibc_ofa(i, ic, ic+1:nvirt)
ibc_ofa(i, ic, ic+1:nvirt) =
$ ibc_ofa(i, ic+1:nvirt, ic)
ibc_ofa(i, ic+1:nvirt, ic) = scr1v(ic+1:nvirt)
end do
end do
end if
cc TIMEADD(times(1,12),times)
endif
if(a.gt.ind+n-1.and.a.eq.b)then
la=lb
endif
delta=delta1
if(a.eq.b)delta=delta+1.d0
fcba=fcb+fdv(a)
c
c {{{ inactive part: build w with loops
c do k=1,nocc
c do j=1,nocc
c do i=1,nocc
c sum=0.d0
c sum2=0.d0
c do d=1,nvirt
c sum=sum+iabc(i,d,a,b)*tt(k,j,c,d)
c sum=sum+iabc(i,d,a,c)*tt(j,k,b,d)
c sum=sum+iabc(k,d,c,a)*tt(j,i,b,d)
c sum=sum+iabc(k,d,c,b)*tt(i,j,a,d)
c sum=sum+iabc(j,d,b,c)*tt(i,k,a,d)
c sum=sum+iabc(j,d,b,a)*tt(k,i,c,d)
c enddo
c do l=1,nocc
c sum=sum-ijka(j,k,l,c)*tt(i,l,a,b)
c sum=sum-ijka(k,j,l,b)*tt(i,l,a,c)
c sum=sum-ijka(i,j,l,b)*tt(k,l,c,a)
c sum=sum-ijka(j,i,l,a)*tt(k,l,c,b)
c sum=sum-ijka(k,i,l,a)*tt(j,l,b,c)
c sum=sum-ijka(i,k,l,c)*tt(j,l,b,a)
c enddo
c w(i,j,k)=sum
c enddo
c enddo
c enddo
c }}}
IF(a.eq.b)THEN
c {{{ a.eq.b OR b.eq.c
cc TIME0(times)
call dgemm('n','t',nocc,nocc**2,nvirt,1.d0,dcore(imem+
$(a-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,
$tt(1,1,1,c),nocc**2,0.d0,v,nocc)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,a),nocc**2,
$dcore(imem+
$(a-1)*nocc*nvirt+(c-ind)*nocc*nvirt**2),nocc,0.d0,scro,nocc**2)
do k=1,nocc
call daxpy(nocc**2,1.d0,scro(k,1,1),nocc,v(1,1,k),1)
enddo
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,a),nocc**2,
$dcore(imem+
$(c-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,1.d0,v,nocc**2)
cc TIMEADD(times(1,13),times)
cc TIME0(times)
C 2.tag
call dgemm('n','t',nocc,nocc**2,nocc,-1.d0,tt(1,1,a,a),nocc,
$ijka(1,1,1,c),nocc**2,1.d0,v,nocc)
call dgemm('n','n',nocc,nocc**2,nocc,-1.d0,tt(1,1,a,c),nocc,
$ijka(1,1,1,a),nocc,1.d0,v,nocc)
call dgemm('n','t',nocc**2,nocc,nocc,-1.d0,ijka(1,1,1,a),nocc**2,
$tt(1,1,c,a),nocc,1.d0,v,nocc**2)
call dcopy(nocc**3,v,1,w,1)
do k=1,nocc
call tradd(nocc,1.d0,v(1,1,k),w(1,1,k))
enddo
cc TIMEADD(times(1,14),times)
ELSE IF(b.eq.c)THEN
cc TIME0(times)
call dgemm('n','t',nocc,nocc**2,nvirt,1.d0,dcore(imem+
$(a-1)*nocc*nvirt+(lb-1)*nocc*nvirt**2),nocc,
$tt(1,1,1,b),nocc**2,0.d0,v,nocc)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,b),nocc**2,
$dcore(imem+
$(b-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,1.d0,v,nocc**2)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,a),nocc**2,
$dcore(imem+
$(b-1)*nocc*nvirt+(lb-1)*nocc*nvirt**2),nocc,0.d0,scro,nocc**2)
do k=1,nocc
call daxpy(nocc**2,1.d0,scro(k,1,1),nocc,v(1,1,k),1)
enddo
cc TIMEADD(times(1,15),times)
cc TIME0(times)
C 2.tag
call dgemm('n','t',nocc,nocc**2,nocc,-1.d0,tt(1,1,a,b),nocc,
$ijka(1,1,1,b),nocc**2,1.d0,v,nocc)
call dgemm('n','t',nocc**2,nocc,nocc,-1.d0,ijka(1,1,1,b),nocc**2,
$tt(1,1,b,a),nocc,1.d0,v,nocc**2)
call dgemm('t','n',nocc**2,nocc,nocc,-1.d0,
$ijka(1,1,1,a),nocc,tt(1,1,b,b),nocc,1.d0,v,nocc**2)
call dcopy(nocc**3,v,1,w,1)
do k=1,nocc
do j=1,nocc
c do i=1,nocc
c w(i,j,k)=w(i,j,k)+v(i,k,j)
c enddo
call daxpy(nocc,1.d0,v(j,k,1),nocc**2,w(j,1,k),nocc)
enddo
enddo
cc TIMEADD(times(1,16),times)
c }}}
ELSE
c {{{ first 6 term abci*T2
cc TIME0(times(1,99))
c a)
cc TIME0(times)
call dgemm('n','t',nocc,nocc**2,nvirt,1.d0,dcore(imem+
$(a-1)*nocc*nvirt+(lb-1)*nocc*nvirt**2),nocc,
$tt(1,1,1,c),nocc**2,0.d0,w,nocc)
cc TIMEADD(times(1,40),times)
c d)
cc TIME0(times)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,b),nocc**2,
$dcore(imem+
$(c-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,1.d0,w,nocc**2)
cc TIMEADD(times(1,44),times)
c f)
cc TIME0(times)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,a),nocc**2,
$dcore(imem+
$(b-1)*nocc*nvirt+(c-ind)*nocc*nvirt**2),nocc,0.d0,scro,nocc**2)
cc TIMEADD(times(1,47),times)
c old do k=1,nocc
c old call daxpy(nocc**2,1.d0,scro(k,1,1),nocc,w(1,1,k),1)
c old enddo
c c) <jd|ba>*R(ki,dc)=<jd|ba>*T(ki,cd)
cc TIME0(times)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,rr(1,1,1,c),nocc**2,
$dcore(imem+
$(b-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,1.d0,scro,nocc**2)
cc TIMEADD(times(1,42),times)
cccc c)
c old call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,tt(1,1,1,c),nocc**2,
c old $dcore(imem+
c old $(b-1)*nocc*nvirt+(la-1)*nocc*nvirt**2),nocc,1.d0,scro,nocc**2)
c old do k=1,nocc
c old do j=1,nocc
c old call daxpy(nocc,1.d0,scro(j,k,1),nocc**2,w(j,1,k),nocc)
c old enddo
c old enddo
c b) <id|ac>*R(jk,db)=<id|ac>*T(jk,bd)
cc TIME0(times)
call dgemm('n','t',nocc,nocc**2,nvirt,1.d0,dcore(imem+
$(a-1)*nocc*nvirt+(c-ind)*nocc*nvirt**2),nocc,
$rr(1,1,1,b),nocc**2,1.d0,w,nocc)
cc TIMEADD(times(1,41),times)
c e) R(ij,da)*<kd|bc>=T(ij,ad)*<kd|bc>
cc TIME0(times)
call dgemm('n','t',nocc**2,nocc,nvirt,1.d0,rr(1,1,1,a),nocc**2,
$dcore(imem+
$(c-1)*nocc*nvirt+(lb-1)*nocc*nvirt**2),nocc,1.d0,w,nocc**2)
cc TIMEADD(times(1,45),times)
cc TIMEADD(times(1,17),times(1,99))
c old do k=1,nocc
c old call tradd(nocc,1.d0,scro(1,1,k),w(1,1,k))
c old enddo
c }}}
c
c {{{ second 6 term ijka*T2
c b)
cc TIME0(times)
call dgemm('n','n',nocc,nocc**2,nocc,-1.d0,tt(1,1,a,c),nocc,
$ijka(1,1,1,b),nocc,1.d0,w,nocc)
cc TIMEADD(times(1,51),times)
c c)
cc TIME0(times)
call dgemm('n','n',nocc**2,nocc,nocc,-1.d0,ijka(1,1,1,b),nocc**2,
$tt(1,1,a,c),nocc,1.d0,w,nocc**2)
cc TIMEADD(times(1,52),times)
c a)
cc TIME0(times)
call dgemm('n','t',nocc,nocc**2,nocc,-1.d0,tt(1,1,a,b),nocc,
$ijka(1,1,1,c),nocc**2,1.d0,w,nocc)
cc TIMEADD(times(1,50),times)
c f)
cc TIME0(times)
call dgemm('t','n',nocc**2,nocc,nocc,-1.d0,ijka(1,1,1,c),nocc,
$tt(1,1,a,b),nocc,1.d0,scro,nocc**2)
cc TIMEADD(times(1,55),times)
c e) <ki|la>*T(lj,cb)
cc TIME0(times)
call dgemm('n','n',nocc**2,nocc,nocc,-1.d0,ijka(1,1,1,a),nocc**2,
$tt(1,1,c,b),nocc,1.d0,scro,nocc**2)
cc TIMEADD(times(1,54),times)
c d) T(kl,cb)*<li|ja>
cc TIME0(times)
call dgemm('n','n',nocc,nocc**2,nocc,-1.d0,tt(1,1,c,b),nocc,
$ijka(1,1,1,a),nocc,1.d0,scro,nocc)
cc TIMEADD(times(1,53),times)
cc TIME0(times)
do k=1,nocc
call daxpy(nocc**2,1.d0,scro(k,1,1),nocc,w(1,1,k),1)
enddo
cc TIMEADD(times(1,48),times)
c old do k=1,nocc
c old call dgemv('t',nocc,nocc**2,-1.d0,ijka(1,1,1,a),nocc,
c old $tt(1,k,b,c),1,1.d0,w(1,1,k),1)
c old call dgemm('t','t',nocc,nocc,nocc,-1.d0,ijka(1,1,k,a),nocc,
c old $tt(1,1,b,c),nocc,1.d0,w(1,1,k),nocc)
c old do l=1,nocc
c old call dger(nocc,nocc,-1.d0,ijka(1,k,l,c),1,
c old $tt(1,l,b,a),1,w(1,1,k),nocc)
c old enddo
c old enddo
cc TIMEADD(times(1,18),times(1,99))
c }}}
ENDIF
C Construct V Eq.5
cc TIME0(times)
c call dcopy(nocc**3,w,1,v,1)
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(i)
do i=1,nocc
v(i,1:nocc,1:nocc)=w(i,1:nocc,1:nocc)
$ +t(i,a)*ijab(1:nocc,1:nocc,b,c)
enddo
C$OMP END PARALLEL DO
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(j)
do j=1,nocc
v(1:nocc,j,1:nocc)=v(1:nocc,j,1:nocc)
$ +t(j,b)*ijab(1:nocc,1:nocc,a,c)
enddo
C$OMP END PARALLEL DO
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(k)
do k=1,nocc
v(1:nocc,1:nocc,k)=v(1:nocc,1:nocc,k)
$ +t(k,c)*ijab(1:nocc,1:nocc,a,b)
enddo
C$OMP END PARALLEL DO
cc TIMEADD(times(1,19),times)
C Calculate Energy
c {{{ canonical
sum=0.d0
IF(localcc.eq.'off ')THEN
cc TIME0(times)
IF(a.eq.b.or.b.eq.c)THEN
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(i,j,k,delta3,fkj,istart,delta2,fkji,y)
C$OMP& REDUCTION(+:sum)
do k=1,nocc
do j=k,nocc
delta3=1.d0
if(j.eq.k)delta3=delta3+1
fkj=fdo(k)+fdo(j)
if(j.eq.k)then
istart=j+1
else
istart=j
endif
do i=istart,nocc
delta2=delta3
if(i.eq.j)delta2=delta2+1
fkji=fkj+fdo(i)
y=v(i,j,k)+v(k,i,j)+v(j,k,i)
C I
sum=sum+(
$-(w(i,j,k)+w(k,i,j)+w(j,k,i))*(v(i,j,k)+v(k,i,j)+v(j,k,i))
$+(w(i,j,k)*v(i,j,k)
$+w(k,i,j)*v(k,i,j)
$+w(j,k,i)*v(j,k,i))*3.d0
$)/((fkji-fcba)*delta2)
enddo
enddo
enddo
C$OMP END PARALLEL DO
sum=2.d0*sum
ELSE ! .not.(a.eq.b.or.b.eq.c)
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(i,j,k,delta3,fkj,istart,delta2,fkji,y,z)
C$OMP& REDUCTION(+:sum)
do k=1,nocc
do j=k,nocc
delta3=1.d0
if(j.eq.k)delta3=delta3+1
fkj=fdo(k)+fdo(j)
if(j.eq.k)then
istart=j+1
else
istart=j
endif
do i=istart,nocc
c do i=j,nocc
delta2=delta3
if(i.eq.j)delta2=delta2+1
fkji=fkj+fdo(i)
y=v(i,j,k)+v(k,i,j)+v(j,k,i)
z=v(i,k,j)+v(j,i,k)+v(k,j,i)
C I
sum=sum+(
$+(w(i,j,k)+w(k,i,j)+w(j,k,i))*(y-2.d0*z)
$+(w(i,k,j)+w(j,i,k)+w(k,j,i))*(z-2.d0*y)
$+(w(i,j,k)*v(i,j,k)
$+w(k,i,j)*v(k,i,j)
$+w(j,k,i)*v(j,k,i)
$+w(i,k,j)*v(i,k,j)
$+w(j,i,k)*v(j,i,k)
$+w(k,j,i)*v(k,j,i))*3.d0
$)/((fkji-fcba)*delta2)
enddo
enddo
enddo
C$OMP END PARALLEL DO
ENDIF
cc TIMEADD(times(1,20),times)
c }}}
ELSE ! localcc
c {{{ localcc energy
C Denominator in W!
do k=1,nocc
do j=1,nocc
fkj=fdo(k)+fdo(j)
do i=1,nocc
fkji=fkj+fdo(i)
w(i,j,k)=w(i,j,k)/(fkji-fcba)
c to compare with laplace
c w(i,j,k)=-w(i,j,k)/dsqrt(-fkji+fcba)
c v(i,j,k)=v(i,j,k)/dsqrt(-fkji+fcba)
enddo
enddo
enddo
c do k=1,nocc
C transforming the indices
call dgemv('t',nocc,nocc**2,1.d0,v,nocc,ui,1,0.d0,v1,1)
call dgemv('t',nocc,nocc**2,1.d0,w,nocc,ui,1,0.d0,w1,1)
call dgemv('n',nocc**2,nocc,1.d0,v,nocc**2,ui,1,0.d0,v3,1)
call dgemv('n',nocc**2,nocc,1.d0,w,nocc**2,ui,1,0.d0,w3,1)
do j=1,nocc
call dgemv('n',nocc,nocc,1.d0,v(1,1,j),nocc,ui,1,0.d0,
$v2(1,j),1)
call dgemv('n',nocc,nocc,1.d0,w(1,1,j),nocc,ui,1,0.d0,
$w2(1,j),1)
enddo
c
IF(a.eq.b)THEN
do j=1,nocc
sum=sum
$+ (w3(j,j)-w1(j,j))*(v3(j,j)-v1(j,j))
do i=j+1,nocc
sum=sum
$+ w3(i,j)*(2.d0*v3(i,j)-v1(i,j)-v2(j,i))
$+ w2(j,i)*(v2(j,i)-v3(i,j))
$+ w1(i,j)*(v1(i,j)-v3(i,j))
enddo
enddo
sum=2.d0*sum
ELSE IF(b.eq.c)THEN
do j=1,nocc
sum=sum
$+ (w3(j,j)-w1(j,j))*(v3(j,j)-v1(j,j))
do i=j+1,nocc
sum=sum
$+ w1(i,j)*(2.d0*v1(i,j)-v3(i,j)-v2(j,i))
$+ w2(j,i)*(v2(j,i)-v1(i,j))
$+ w3(i,j)*(v3(i,j)-v1(i,j))
enddo
enddo
sum=2.d0*sum
ELSE
do j=1,nocc
c if(i.eq.j)then
c y=v(k,j,j)+v(j,k,j)+v(j,j,k)
c sum=sum
c $+ w(j,j,k)*(3.d0*v(j,j,k)-y)
c $+ w(j,k,j)*(3.d0*v(j,k,j)-y)
c $+ w(k,j,j)*(3.d0*v(k,j,j)-y)
y=v1(j,j)+v2(j,j)+v3(j,j)
sum=sum
$+ w3(j,j)*(3.d0*v3(j,j)-y)
$+ w2(j,j)*(3.d0*v2(j,j)-y)
$+ w1(j,j)*(3.d0*v1(j,j)-y)
c endif
do i=j+1,nocc
c if(i.gt.j)then
c sum=sum+
c $ 2.d0/3.d0*
c $ (w(i,j,k)+w(j,k,i)+w(k,i,j)-w(i,k,j)-w(j,i,k)-w(k,j,i))*
c $ (v(i,j,k)+v(k,i,j)+v(j,k,i)-v(i,k,j)-v(j,i,k)-v(k,j,i))
c $+ w(i,j,k)*(2.d0*v(i,j,k)-v(i,k,j)-v(k,j,i))
c $+ w(j,k,i)*(2.d0*v(j,k,i)-v(k,j,i)-v(j,i,k))
c $+ w(k,i,j)*(2.d0*v(k,i,j)-v(j,i,k)-v(i,k,j))
c $+ w(i,k,j)*(2.d0*v(i,k,j)-v(i,j,k)-v(k,i,j))
c $+ w(j,i,k)*(2.d0*v(j,i,k)-v(k,i,j)-v(j,k,i))
c $+ w(k,j,i)*(2.d0*v(k,j,i)-v(j,k,i)-v(i,j,k))
sum=sum+
$ 2.d0/3.d0*
$ (w3(i,j)+w2(j,i)+w1(i,j)-w2(i,j)-w3(j,i)-w1(j,i))*
$ (v3(i,j)+v1(i,j)+v2(j,i)-v2(i,j)-v3(j,i)-v1(j,i))
$+ w3(i,j)*(2.d0*v3(i,j)-v2(i,j)-v1(j,i))
$+ w2(j,i)*(2.d0*v2(j,i)-v1(j,i)-v3(j,i))
$+ w1(i,j)*(2.d0*v1(i,j)-v3(j,i)-v2(i,j))
$+ w2(i,j)*(2.d0*v2(i,j)-v3(i,j)-v1(i,j))
$+ w3(j,i)*(2.d0*v3(j,i)-v1(i,j)-v2(j,i))
$+ w1(j,i)*(2.d0*v1(j,i)-v2(j,i)-v3(i,j))
c endif
enddo
enddo
c enddo
ENDIF
c }}}
ENDIF
et=et+sum/delta
enddo
prk=prk+nvirt-astart+1
enddo
enddo
ind=ind+n
enddo
write(iout,'(" 100% done.")')
if(localcc.eq.'off ') et=2.d0*et
c write(6,'(f17.12)')et
c call timer
return
end subroutine!}}}
c
cC {{{ laplace_localbtcorr
c************************************************************************
c subroutine localbtcorr_lap(t1,tt,fdo,fdv,aijk,abij,w,v,
c $bsymmv,symmocc,dgroup,first,last,co,imem,dcore,
c $et,nquad,quad,t,abci,ttl,ss,rr,rrd,abci_inmem,times,cmo,
c $jij,jia,jab,dfnb,localcc,talg,lljab,lljpai,apbc,nblocki,iblocki,
c $tmpvsize,IabIJ,ttl2,ttl3,ccsdalg,jai,
c $irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg,iout,
c $mpi_rank,mpi_size,mpi_ccsd,ccsdmkl,ptthreads,omp_max_threads,
c $ccsd_communicator,master_thread)
c************************************************************************
c implicit none
cc for mpi
c logical mpi_ccsd
c#if defined(MPI)
c include "mpif.h"
c integer comm_scr, stat(MPI_STATUS_SIZE)
c#endif
c integer mpi_rank, mpi_size, mpi_err, request, npr
c integer find, lind, nrank, bsize, kk, prk_sent
c integer icount(0:nocc),kjinum(nocc), ccsd_communicator
c integer nummes, max_threads,jfrstm1,jpullback,actblocknn
c logical flag,ptdone
cc end mpi
c character*3 ccsdmkl
c integer ptthreads,omp_max_threads,imem,jinrr
c real*8 dcore(*)
c integer irecln,ibufln,nbasis,nocc,nvirt,dgroup,nirmax,iout
c integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
c integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis),nquad,iquad
c integer co(nbasis),rrd,dfnb
c integer a,b,c,i,j,k,kmax,jj
c real*8 t(nvirt,nocc),t1(nvirt,nocc),tt(nvirt,nvirt,nocc,nocc)
c real*8 fdv(nvirt),fdo(nocc),quad(nvirt+nocc,nquad),fi,fj,fij,fbij
c real*8 ttl(nvirt,nvirt,nocc,1),ss(nvirt,nvirt,nocc,nocc)
c real*8 ttl2(nvirt,nvirt,1,nocc),ttl3(nvirt,nvirt),fb
c real*8 rr(nvirt,nvirt,nocc,rrd)
c real*8 et,sum,y,z,fc
c real*8 w(nvirt,nvirt,nvirt),v(nvirt,nvirt,nvirt)
c real*8 pr,prold,IabIJ(nvirt,nvirt)
c real*8 delta,delta1,delta2,delta3,dkji
c real*8 fcb,fcba,fkj,fkji,tmpvsize(*)
c real*8 abij(nvirt,nvirt,nocc,*) !<12|12>
c real*8 aijk(nvirt,nocc,nocc,nocc) !<12|12>
c real*8 abci(nvirt,nvirt,nvirt,*),cmo(nocc,nocc)
c real*8 jij(dfnb,nocc,nocc),jab(dfnb,nvirt,nvirt)
c real*8 jia(nocc,nvirt,dfnb),lljab(dfnb,nvirt,nvirt)
c real*8 lljpai(dfnb,nvirt,nocc),apbc(dfnb,nvirt,nvirt),rdummy
c logical notend,abci_inmem,denom,abijinmem,master_thread
c integer ind,n,astart,istart,li,lj,kinrr,prk,kji,ljj,jst
c integer nblocki,iblocki,actblock,actblocki
c character*4 localcc,talg
c
c character*8 ccsdalg
c real*8 scr(nocc, nvirt), jai(nvirt, nocc, dfnb)
c real*8 scr1v(nvirt), scr1o(nocc)
c
c integer nind ! dummy
c integer indices(nocc) ! dummy
cc
c real*8 times(6,0:100),fact
cc real*8 lt1(nvirt,nocc),scrt(nvirt,nvirt,nocc,nocc)
cc real*8 cpabci(nvirt,nvirt,nvirt,nocc),scr2(nvirt,nocc,nocc,nocc)
c logical can,iinbl
c real*8 epair(nocc,nocc)
c
c ptthreads=1 !!!!!!!!!!
c
c nummes = 0
c#if defined(MPI)
c comm_scr = 0
c if(localcc.eq.'off '.and.mpi_size.ne.1) then
c write(iout,*)'talg=lapl is not implemented with MPI&localcc=off'
c call mrccend(1)
c endif
c#endif
c abijinmem=.true.
c if (localcc.ne.'off ') abijinmem=.false.
cc
c if(localcc .eq. 'off ') then
c ! jai contains Jia, transpose it
c do k = 1, dfnb
c call dcopy(nvirt*nocc, jai(1, 1, k), 1, scr, 1)
c
c do i = 1, nocc
c call dcopy(nvirt, scr(i, 1), nocc, jai(1, i, k), 1)
c end do
c end do
c end if
c
c epair=0.d0
cc open(112,file='tripenergies')
cc
c if (.false.) then
c if (talg.eq.'topr') then
c write(*,*) 'warning cmo=I test'
c cmo=0.d0
c do i=1,nocc
c cmo(i,i)=1.d0
c enddo
c endif
c endif
cc write(*,*) 'rrd=nocc, warning',rrd
cc rrd=nocc
c can=.false.
c if (can) then
c write(*,*) 'warning quad=1'
c quad=1.d0
c localcc='off '
c endif
c if (localcc.eq.'off '.and..not.abci_inmem) then
c write(iout,*)'Full abci list is stored in memory for canonical',
c $'Laplace transformed (T), insufficient memory, should not be here'
c call mrccend(1)
c endif
cc write(*,*) 'warning can=T'
cc can=.true.
cc start
c et=0.d0
c quad=dsqrt(quad)
c prk=0
c pr=0.d0
c prold=0.d0
c denom=talg.eq.'topr'.or.talg.eq.'to '
c fact=1.d0
c kmax=1
c if(localcc.eq.'off ') kmax=nocc
cc all kji combination:
c icount(0:nocc) = 0
c kji=0
c npr=max(0,nocc-iblocki-1) ! max number of j for which j is outside the block and Ajk is needed
c do k=1,kmax
c do i=k+1,nocc ! j=k
c kji=kji+1
c enddo
c kjinum(1) = kji
c icount(1) = (6 * (nvirt**4 + nvirt**3*nocc) + ! T3
c $ 35 * nvirt**3) * kji + nvirt**3*dfnb + ! energy and <ab|cj>
c $ nvirt**3*dfnb*2 + npr*nvirt**4 ! Ajk, Akj and J*A
c do j=k+1,nocc ! j>k
c do i=j,nocc
c kji=kji+1
c enddo
c kjinum(j) = kji
c icount(j) = (6 * (nvirt**4 + nvirt**3*nocc) + ! T3
c $ 35 * nvirt**3) * kji + nvirt**3*dfnb*j + ! energy and <ab|cj>
c $ 2*nvirt**3*dfnb+nvirt**4*j*(max(npr-j-1,0))/2 ! sum_{jj=1}^j extra cost of Ajk, Akj and J*A
c enddo ! j
c enddo ! k
c dkji=dble(kji*nquad)
cc
c find = 1 ! will be first j index of mpi thread
c lind = nocc ! will be last j index of mpi thread
c notend=.true.
c#if defined(MPI)
c if(mpi_ccsd) then
c call ptindices_old(icount, nocc, mpi_rank, mpi_size, find,lind,
c $ notend, max_threads)
c
c if(mpi_size .gt. 1 .and. mpi_rank .eq. mpi_size-1) then
c if(mpi_size .ne. max_threads) then
c write(6, '(" Warning: only ", i3,
c $ " MPI threads were utilized for (T)"/)') max_threads
c endif
c endif
c
c if(mpi_size .gt. 1) then
c if(mpi_rank .eq. 0 .and. mpi_size .gt. 1) then
c write(iout, '(" Cost function")')
c write(iout, '(" j cost tot cost of k ")')
c do k = 1, nocc
c write(*, '(i6," ",2es13.6)') k, dble(icount(k)),
c $ dble(icount(k)-icount(k-1))
c end do
c write(iout, '(" Allocation:")')
c write(iout, '(" Rank Indices")')
c end if
c call flush(iout)
c call sleep_f(1)
c call MPI_Barrier(ccsd_communicator, mpi_err)
c
c write(*, '(i6,i7,"-",i2,i6,"%")') mpi_rank, find, lind,
c $ max(nint(dble((icount(lind)-icount(find-1)))/icount(nocc)*100),0)
c call flush(iout)
c call sleep_f(1)
c call MPI_Barrier(ccsd_communicator, mpi_err)
c end if
c endif
c#endif
c jfrstm1=max(1,find-1) ! first occ index in the stored (ab|ci) array minus 1
c jpullback=max(0,find-2) ! helps to set the staring occ index of the stored (ab|ci) array
c write(iout,'(" 1% done.")')
cc
c if (notend) then
c notend=.false.
c do iquad=1,nquad
cc multiply with Laplace factors
c if(localcc.eq.'off ')then
cc test canonical if(.false.) then
cc {{{ localcc.eq.'off ' case: FOR TEST PURPOSES, not efficient
c do i=1,nocc
c fi=quad(i,iquad)
c t(1:nvirt,i)=t1(1:nvirt,i)*quad(nocc+1:nocc+nvirt,iquad)*fi
c enddo
cc lt1=t
cc call dgemm('n','n',nvirt,nocc,nocc,1.d0,lt1,nvirt,cmo,nocc,
cc $ 0.d0,t,nvirt)
cC read integrals to memory
c if(ccsdalg .eq. 'disk ') then
c call abijread(abij,bsymmv,symmocc,dgroup,first,last,co,v,
c $ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
c call aijkread(aijk,bsymmv,symmocc,dgroup,first,last,co,v,
c $ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
c else
c ! (ai|bj) = <ab|ij>
c call dgemm('n', 't', nvirt*nocc, nvirt*nocc, dfnb,
c $ 1.d0, jai, nvirt*nocc, jai, nvirt*nocc,
c $ 0.d0, abij, nvirt*nocc)
c ! aibj -> abij
c do j = 1, nocc
c do a=1,nvirt
c call dcopy(nocc*nvirt, abij(a, 1, 1, j), nvirt, scr,1)
c
c do b=1,nvirt
c abij(a, b, 1:nocc, j) = scr(1:nocc, b)
c enddo
c enddo
c enddo
c
c ! (aj|ik) = <ai|jk>
c call dgemm('n', 'n', nvirt*nocc, nocc**2, dfnb,
c $ 1.d0, jai, nvirt*nocc, jij, dfnb,
c $ 0.d0, aijk, nvirt*nocc)
c ! ajik -> aijk
c do k = 1, nocc
c do a = 1, nvirt
c do j = 1, nocc
c scr1o(j+1:nocc) = aijk(a, j+1:nocc, j, k)
c aijk(a, j+1:nocc, j, k) = aijk(a, j, j+1:nocc, k)
c aijk(a, j, j+1:nocc, k) = scr1o(j+1:nocc)
c enddo
c enddo
c end do
c end if
cC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
cC$OMP& PRIVATE(j,fj,i,fij,b,fbij)
c do j=1,nocc
c fj=quad(j,iquad)
c do i=1,nocc
c fij=quad(i,iquad)*fj
c do b=1,nvirt
c fbij=quad(nocc+b,iquad)*fij
c abij(1:nvirt,b,i,j)=abij(1:nvirt,b,i,j)*
c $ quad(nocc+1:nocc+nvirt,iquad)*fbij
c enddo
c enddo
c enddo
cC$OMP END PARALLEL DO
c do j=1,nocc
c fj=quad(j,iquad)
c if (iquad.gt.1) fj=fj/quad(j,iquad-1)
c do i=1,nocc
c fij=quad(i,iquad)*fj
c if (iquad.gt.1) fij=fij/quad(i,iquad-1)
c do b=1,nvirt
c do a=1,nvirt
c fbij=fij*quad(nocc+b,iquad)*quad(nocc+a,iquad)
c if (iquad.gt.1)
c $ fbij=fbij/quad(nocc+a,iquad-1)/quad(nocc+b,iquad-1)
c ss(a,b,i,j)= tt(a,b,i,j)*fbij ! ss is in tt
c enddo
c enddo
c enddo
c enddo
c if(ccsdalg .eq. 'disk ') then
cc call abciread(1,nvirt,abci,bsymmv,symmocc,dgroup,co,
cc $first,last,v,
cc $irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
c call abciread(1,nocc,abci,bsymmv,symmocc,dgroup,co,
c $ first,last,v,irecln,ibufln,nbasis,nocc,nvirt,
c $ nirmax,multpg)
c else
c ! (ac|bi) = <ab|ci>
c call dgemm('t', 't', nvirt**2, nvirt*nocc, dfnb,
c $ 1.d0, jab, dfnb, jai, nvirt*nocc,
c $ 0.d0, abci, nvirt**2)
c ! acbi -> abci
c do i = 1, nocc
c do a = 1, nvirt
c do b = 1, nvirt
c scr1v(b+1:nvirt) = abci(a, b+1:nvirt, b, i)
c abci(a, b+1:nvirt, b, i) = abci(a, b, b+1:nvirt, i)
c abci(a, b, b+1:nvirt, i) = scr1v(b+1:nvirt)
c end do
c end do
c end do
c end if
c do c=1,nvirt
c fc=quad(nocc+c,iquad)
c do b=1,nvirt
c fj=quad(nocc+b,iquad)*fc
c do a=1,nvirt
c fij=quad(nocc+a,iquad)*fj
c do i=1,nocc
c abci(a,b,c,i)=abci(a,b,c,i)*quad(i,iquad)*fij
c enddo
c enddo
c enddo
c enddo
c do j=1,nocc
c fj=quad(j,iquad)
c do i=1,nocc
c fij=quad(i,iquad)*fj
c do k=1,nocc
c do a=1,nvirt
c aijk(a,k,i,j)=aijk(a,k,i,j)*fij*
c $ quad(nocc+a,iquad)/quad(k,iquad)
c enddo
c enddo
c enddo
c enddo
cc }}}
c else ! if localcc=on
cc write(*,*) 'warning kmax'
cc kmax=nocc
cc multiply with Laplace factors and trf to the Mayer basis: (parts of) T2, abci, abij and aijk
c call trf2laplbasis(tt,ttl,jij,jab,jia,nocc,nvirt,dfnb,abij,
c $abci,aijk,quad,iquad,nquad,cmo,aijk,lljpai,lljab,rr,rr,w,w,aijk,v,
c $t,t,t1,talg,fdo,v,w,abci_inmem,ttl2,w,w,IabIJ)
c endif
c actblock=0
c if (find.gt.1) actblock=1
cC
cc transpose tt(a,b,i,k) to rr(b,a,i,k) for all k
c if (rrd.eq.nocc) then
c do k=1,nocc
c do i=1,nocc
c call tr(nvirt,ss(1,1,i,k),rr(1,1,i,k))
c enddo
c enddo
c endif
c ind=1
c n=nocc
cc
c do k=ind,ind+n-1
c if (k.gt.kmax) exit
cc transpose tt(a,b,i,k) to rr(b,a,i) for k
c if (rrd.eq.1) then
c do i=1,nocc
c call tr(nvirt,ss(1,1,i,k),rr(1,1,i,1))
c enddo
c kinrr=1
c else
c kinrr=k
c endif
cc {{{ localcc.eq.'off ' case: FOR TEST PURPOSES, not efficient
c if(localcc.eq.'off ')then
c if (talg.eq.'lapl'.or.talg.eq.'lato') then
cc make ttl(-,~,i,k) & ttl2(-,~,k,i) from ss(-,-,-.-)
c do i=1,nocc
c do b=1,nvirt
c fb=quad(nocc+b,iquad)**2
c ttl (1:nvirt,b,i,1)=tt(1:nvirt,b,i,k)/fb
c ttl2(1:nvirt,b,1,i)=tt(1:nvirt,b,k,i)/fb
c enddo
c enddo
c endif
c endif
cc }}}
cc
c find = k
c do j=find,min(lind,nocc)
c jinrr=j !!!!!!!!!!!
c TIME0(times(1,98))
c if(mpi_rank .eq. mpi_size-1) then
c pr=dble(prk)/dkji !progress counter
c else
c pr = dble(prk) / ((kjinum(lind) - kjinum(find-1))*nquad)
c end if
c if(pr.ge.prold+0.2d0.and.pr.ne.1.d0)then
c prold=pr
c if(mpi_rank .ne. 0)write(iout,'(i4,"% done.")')nint(pr*100)! last rank prints correct percentage
c if(mpi_rank .eq. mpi_size-1)
c $ write(6, '(i4, "% done.")') nint(pr * 100) ! for master
c endif
c actblocknn=max(actblock-1,0) ! make the actblock-1 index non-negative
c lj=j
cc {{{ if (.not.abci_inmem)
c if (.not.abci_inmem) then
c if (j.gt.1) lj=j-actblocknn*iblocki-jpullback ! position of j in abci array, if find>1 lj starts from 2, hence the -max(0,lind-2)
c if (j.eq.find.or.
c $ (j.ne.jfrstm1+1.and. ! j=2 is done if find=1, do not assembly it if actblock=1
c $ j.eq.jfrstm1+1+actblocknn*iblocki)) then! make (ab|cj) for the entire actual block if j= 1st index of the actual block
c jst=j
c if (j.eq.1) jst=2
c do jj=jst,min(jst+iblocki-1,nocc)
c call dgemm('t','n',nvirt**2,nvirt,dfnb,1.d0,lljab,dfnb,
c $lljpai(1,1,jj),dfnb,0.d0,w,nvirt**2) ! (a,b|c,jj)=lljab(\bar a \bar b | P)^T lljpai(P | \bar c \bar jj)
c ljj=jj-actblocknn*iblocki-jpullback ! position of jj in abci array
c if (actblock.eq.0) ljj=jj
c do b=1,nvirt
c do c=1,nvirt
c abci(1:nvirt,c,b,ljj)=w(1:nvirt,b,c) ! <a,b|c,j>= (a,c|b,j)
c enddo
c enddo
c enddo ! jj
c endif
c if (jfrstm1+actblock*iblocki.lt.nocc) then ! if not last block -> (ab|ci) will be missed for i>jfrstm1+actblocki*iblocki -> contruct Ajk(P,b,c)
cc Ajk(P,c,b)=lljab(P,\bar c,\bar d)*ttl(\bar b,\tilde d,\bar j,\bar k)^T
c call dgemm('n','t',dfnb*nvirt,nvirt,nvirt,1.d0,
c $ lljab,dfnb*nvirt,ttl(1,1,j,k),nvirt,0.d0,apbc,dfnb*nvirt)
c if (j.ne.k) then ! j=k is taken care of inside wabc in the j=k case
cc Ajk(P,b,c)=Ajk(P,c,b)
c do b=1,nvirt
c do c=b+1,nvirt
c tmpvsize(1:dfnb)=apbc(1:dfnb,c,b)
c apbc(1:dfnb,c,b)=apbc(1:dfnb,b,c)
c apbc(1:dfnb,b,c)=tmpvsize(1:dfnb)
c enddo
c enddo
cc Ajk(P,b,c)=Ajk(P,b,c)+lljab(P,\bar b,\bar d)*ttl(\bar c,\tilde d,\bar k,\bar j)^T
c call dgemm('n','t',dfnb*nvirt,nvirt,nvirt,1.d0,
c $ lljab,dfnb*nvirt,ttl2(1,1,k,j),nvirt,1.d0,apbc,dfnb*nvirt)
c endif ! j.ne.k
c endif ! not last block
c endif
cc }}}
c delta2=1.d0
c if(j.eq.k)delta2=delta2+1
c fkj=fdo(k)+fdo(j)
c if(j.eq.k)then !i=j=k => no contribution!
c istart=j+1
c else
c istart=j
c endif
c do i=istart,nocc
c iinbl=.true.
c li=i-ind+1
cc {{{ if (.not.abci_inmem)
cc if(i.gt.ind+n-1.and.i.ne.j)then
cc li=n+2
cc if (.not.abci_inmem.and.localcc.eq.'off ')
cc $ call abciread(i,1,dcore(imem+(li-1)*nvirt**3),
cc $bsymmv,symmocc,dgroup,co,first,last,v,
cc $irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
cc endif
cc if(i.gt.ind+n-1.and.i.eq.j)then
cc li=lj
cc endif
c if (.not.abci_inmem) then
c actblocki=actblock
c if (j.eq.1) actblocki=1
c if (i.le.jfrstm1+actblocki*iblocki) then ! if i is still in the actual block, i.e. i <= the index of the last stored (ab|ci)
c li=i-max(actblocki-1,0)*iblocki-jpullback ! no j=1 & i=1 contribution, i>1
c else ! i.eq.j is in the other branch
c if (j.ne.k) then ! i.ne.j must be fulfilled
c call dgemm('t','n',nvirt,nvirt**2,dfnb,1.d0,
c $ lljpai(1,1,i),dfnb,apbc,dfnb,0.d0,w,nvirt) ! w(a,b,c)= lljpai(P,\bar a,\bar I) Ajk(P,b,c)
c elseif (j.eq.k) then
c call dgemm('t','n',nvirt,nvirt**2,dfnb,1.d0,
c $ lljpai(1,1,i),dfnb,apbc,dfnb,0.d0,v,nvirt) ! v(a,c,b)= lljpai(P,\bar a,\bar I) Ajk(P,c,b)
c endif
c iinbl=.false.
c endif
c endif ! .not.abci_inmem
cc }}}
c delta3=delta2
c if(i.eq.j)delta3=delta3+1
c fkji=fkj+fdo(i)
c if (talg.eq.'lapl'.or.talg.eq.'lato') then
cc make ttl3(-,~,i,j) & ttl4(-,~,j,i) from ss(-,-,-,-)
c do b=1,nvirt
c fb=quad(nocc+b,iquad)**2
c ttl3(1:nvirt,b)=ss(1:nvirt,b,i,j)/fb
c IabIJ(1:nvirt,b)=ss(1:nvirt,b,j,i)/fb ! ttl4
c enddo
c else
c call dcopy(nvirt**2,ss(1,1,i,j),1,ttl3,1)
c call dcopy(nvirt**2,ss(1,1,j,i),1,IabIJ,1) ! ttl4
c endif
cc compute W and V for all abc indices
c call wabc(nocc,nvirt,i,j,k,imem,dcore,t,ttl,rr,rrd,
c $ abci(1,1,1,li), abci(1,1,1,lj), abci(1,1,1,k-ind+1),
c $kinrr,jinrr,w,v,abij,aijk,times,'lapl',ss,iinbl,localcc,IabIJ,
c $lljpai,dfnb,ttl2,ttl3,IabIJ,1,abijinmem,rdummy,ccsdmkl,ptthreads,
c $omp_max_threads)
cc call wabc_old(nocc,nvirt,i,j,k,1,abci,t,ttl,rr,rrd,li,lj,ind,
cc $kinrr,w,v,abij,aijk,times,'lapl',ss,iinbl,localcc,IabIJ,lljpai,
cc $dfnb,ttl2,ttl3,IabIJ,1,abijinmem,rdummy)
cc {{{ w_abc debug with loops
cc if (.false.) then
cc write(*,*) 'loops'
cc do a=1,nvirt
cc do b=1,nvirt
cc do c=1,nvirt
cc sum=0.d0
cc do d=1,nvirt
cc sum=sum+abci(c,b,d,j)*ttl(a,d,i,k) !e
cc sum=sum+abci(a,b,d,j)*ttl2(c,d,k,i) !f
cc sum=sum+abci(b,c,d,k)*ttl3(a,d) !d
cc sum=sum+abci(a,c,d,k)*IabIJ(b,d) !a
cc sum=sum+abci(b,a,d,i)*ttl2(c,d,k,j) !b
cc sum=sum+abci(c,a,d,i)*ttl(b,d,j,k) !c
cc enddo
cc do l=1,nocc
cc sum=sum-aijk(b,l,j,i)*ss(a,c,l,k) !c
cc sum=sum-aijk(b,l,j,k)*ss(c,a,l,i) !b
cc sum=sum-aijk(c,l,k,j)*ss(b,a,l,i) !a
cc sum=sum-aijk(a,l,i,k)*ss(c,b,l,j) !d
cc sum=sum-aijk(c,l,k,i)*ss(a,b,l,j) !f
cc sum=sum-aijk(a,l,i,j)*ss(b,c,l,k) !e
cc enddo
ccc w(a,b,c)=sum
ccc write(*,'(6i4,es14.6)') k,j,i,a,b,c,sum
cc enddo
cc enddo
cc enddo
ccC calculating V
cc call dcopy(nvirt**3,w,1,v,1)
ccC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
ccC$OMP& PRIVATE(a)
cc do a=1,nvirt
cc v(a,1:nvirt,1:nvirt)=v(a,1:nvirt,1:nvirt)
cc $ +t(a,i)*abij(1:nvirt,1:nvirt,j,k)
cc enddo
ccC$OMP END PARALLEL DO
ccC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
ccC$OMP& PRIVATE(b)
cc do b=1,nvirt
cc v(1:nvirt,b,1:nvirt)=v(1:nvirt,b,1:nvirt)
cc $ +t(b,j)*abij(1:nvirt,1:nvirt,i,k)
cc enddo
ccC$OMP END PARALLEL DO
ccC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
ccC$OMP& PRIVATE(c)
cc do c=1,nvirt
cc v(1:nvirt,1:nvirt,c)=v(1:nvirt,1:nvirt,c)
cc $ +t(c,k)*abij(1:nvirt,1:nvirt,i,j)
cc enddo
ccC$OMP END PARALLEL DO
cc do a=1,nvirt
cc do b=1,nvirt
cc do c=1,nvirt
cc write(*,'(6i4,2es14.6)') k,j,i,a,b,c,w(a,b,c),v(a,b,c)
cc enddo
cc enddo
cc enddo
cc endif
cc }}}
cC calculating energy
c if(localcc.eq.'off ')then
cc test canonical if(.true.) then
cc write(*,*) 'warning can energy'
cc {{{ canonical occ E formula
c IF(i.eq.j.or.j.eq.k)THEN
c sum=0.d0
cC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
cC$OMP& PRIVATE(a,b,c,delta1,fcb,astart,delta,fcba)
cC$OMP& REDUCTION(+:sum)
c do c=1,nvirt
c do b=c,nvirt
c delta1=1.d0
c if(b.eq.c)delta1=delta1+1.d0
c fcb=fdv(c)+fdv(b)
c if(b.eq.c)then
c astart=b+1
c else
c astart=b
c endif
c do a=astart,nvirt
c delta=delta1
c if(a.eq.b)delta=delta+1.d0
c fcba=fcb+fdv(a)
c if (can) fact=(fkji-fcba)
cc y=v(a,b,c)+v(b,c,a)+v(c,a,b)
cc w1=w(a,b,c)+w(b,c,a)+w(c,a,b)
c sum=sum+(
cc $y*w1
c $-(v(a,b,c)+v(b,c,a)+v(c,a,b))*(w(a,b,c)+w(b,c,a)+w(c,a,b))
c
c $+(w(a,b,c)*v(a,b,c)
c $+w(b,c,a)*v(b,c,a)
c $+w(c,a,b)*v(c,a,b))*3.d0
c
cc $)/((fkji-fcba)*delta) ! canonical
c $)/(delta*fact)
c enddo
c enddo
c enddo
cC$OMP END PARALLEL DO
c et=et-4.d0*sum/delta3 ! 2* in E expr and 2* from symmetry of i.eq.j...
c ELSE
c sum=0.d0
cC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
cC$OMP& PRIVATE(a,b,c,delta1,fcb,astart,delta,fcba,y,z)
cC$OMP& REDUCTION(+:sum)
c do c=1,nvirt
c do b=c,nvirt
c delta1=1.d0
c if(b.eq.c)delta1=delta1+1.d0
c fcb=fdv(c)+fdv(b)
c if(b.eq.c)then
c astart=b+1
c else
c astart=b
c endif
c do a=astart,nvirt
cc do a=b,nvirt
c delta=delta1
c if(a.eq.b)delta=delta+1.d0
c fcba=fcb+fdv(a)
c if (can) fact=(fkji-fcba)
c y=v(a,b,c)+v(b,c,a)+v(c,a,b)
c z=v(a,c,b)+v(b,a,c)+v(c,b,a)
c sum=sum+(
c $+(w(a,b,c)+w(b,c,a)+w(c,a,b))*(y-2.d0*z)
c $+(w(a,c,b)+w(b,a,c)+w(c,b,a))*(z-2.d0*y)
c
c $+(w(a,b,c)*v(a,b,c)
c $+w(b,c,a)*v(b,c,a)
c $+w(c,a,b)*v(c,a,b)
c $+w(a,c,b)*v(a,c,b)
c $+w(b,a,c)*v(b,a,c)
c $+w(c,b,a)*v(c,b,a))*3.d0
c
cc $)/((fkji-fcba)*delta) ! canonical
c $)/(delta*fact)
c enddo
c enddo
c enddo
cC$OMP END PARALLEL DO
c et=et-2.d0*sum/delta3
c ENDIF
cc write(*,'(4i4,2es16.6)') iquad,k,j,i,et,2.d0*sum/delta3
cc }}}
c else !if(localcc.ne.'off ')then
cc {{{ talg=occ local E expression
c sum=0.d0
cC$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
cC$OMP& PRIVATE(a,b,c,astart,y,delta1,delta)
cC$OMP& REDUCTION(+:sum)
c do c=1,nvirt
c do b=c,nvirt
c delta1=1.d0
c if(b.eq.c)delta1=delta1+1.d0
c if(b.eq.c)then
c astart=b+1
c else
c astart=b
c endif
c fcb=fdv(c)+fdv(b)
c do a=astart,nvirt
c delta=delta1
c if(a.eq.b)delta=delta+1.d0
c if (denom) then
c fcba=fcb+fdv(a)
c fact=fcba-fkji ! - due to other - in et=et-sum
c endif
c if (i.eq.j) then
c y=v(a,b,c)+v(a,c,b)+v(b,c,a)
c sum=sum+(
c $ w(a,b,c)*(3.d0*v(a,b,c)-y)+
c $ w(a,c,b)*(3.d0*v(a,c,b)-y)+
c $ w(b,c,a)*(3.d0*v(b,c,a)-y)
c $ )/(delta*fact)
cc re $ )/(3.d0*delta)
c else ! i>j
c sum=sum+(
c $ 2.d0/3.d0*
c $ (w(a,b,c)+w(b,c,a)+w(c,a,b)-w(a,c,b)-w(b,a,c)-w(c,b,a))*
c $ (v(a,b,c)+v(b,c,a)+v(c,a,b)-v(a,c,b)-v(b,a,c)-v(c,b,a))
cc
c $ +w(a,b,c)*(2.d0*v(a,b,c)-v(a,c,b)-v(c,b,a))
c $ +w(b,c,a)*(2.d0*v(b,c,a)-v(a,c,b)-v(b,a,c))
c $ +w(c,a,b)*(2.d0*v(c,a,b)-v(b,a,c)-v(c,b,a))
c $ +w(a,c,b)*(2.d0*v(a,c,b)-v(b,c,a)-v(a,b,c))
c $ +w(b,a,c)*(2.d0*v(b,a,c)-v(c,a,b)-v(b,c,a))
c $ +w(c,b,a)*(2.d0*v(c,b,a)-v(a,b,c)-v(c,a,b))
cc
c $)/(delta*fact)
c endif ! i.eq.j or i>j
c enddo
c enddo
c enddo
cC$OMP END PARALLEL DO
c et=et-sum
c epair(j,i)=epair(j,i)-sum
cc }}}
c endif ! localcc.eq.'off '
ccc TIMEADD(times(1,20),times)
c enddo !i
c prk=prk+nocc-istart+1
c#if defined(MPI)
c if(mpi_rank .ne. mpi_size-1) then
c flag = .true.
c if(nummes .ne. 0) then
c call MPI_Test(request, flag, stat, mpi_err)
c end if
c
c if(.not.flag) then
c call MPI_Cancel(request, mpi_err)
c else
c comm_scr = 0
c end if
c
c comm_scr = comm_scr + prk - prk_sent
c prk_sent = prk
c
c call MPI_Isend(comm_scr, 1, MPI_INTEGER_MRCC_MRCC, mpi_size-1,
c $ 0, ccsd_communicator, request, mpi_err)
c nummes=nummes+1
c else if(mpi_ccsd) then ! last rank receives if ccsd is running MPI parallel
c flag = .true.
c do while(flag)
c call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
c $ ccsd_communicator, flag, stat, mpi_err)
c
c if(flag) then
c call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC_MRCC,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator,stat, mpi_err)
c prk = prk + comm_scr
c end if
c enddo
c end if
c#endif
c if (.not.abci_inmem.and.j.eq.jfrstm1+actblock*iblocki)
c $ actblock=actblock+1 ! step to the next block, j=1 is in block 0
c TIMEADD(times(1,97),times(1,98))
cc if (iquad.eq.1) then
cc write(*, "('Rank ',i3,' done with j',i4,' of q',i3,
cc $' CPU [min]:',f10.5,2x,'Wall [min]:',7f10.5)")
cc $ mpi_rank,j,iquad,times(1,97)/60.d0,times(2,97)/60.d0 ; flush(6)
cc elseif(iquad.eq.nquad) then
cc write(*, "('Rank ',i3,' done with j of all q ',i4,
cc $' CPU [min]:',f10.5,2x,'Wall [min]:',7f10.5)")
cc $ mpi_rank,j,times(3,97)/60.d0,times(4,97)/60.d0 ; flush(6)
cc else
cc endif
c enddo !j
c enddo !k
c ind=ind+n
c enddo ! iquad
c endif ! (notend) for blocks
cc
c#if defined(MPI)
c if(mpi_ccsd) then
c call MPI_Ibarrier(ccsd_communicator, request, mpi_err)
c TIMEADD(times(1,95),times(1,96))
cc if(mpi_size .gt. 1) then
cc write(*, "('Rank ',i3,' done with (T) ',
cc $'CPU [min]:',f10.3,2x,'Wall [min]:',7f10.3)")
cc $ mpi_rank,times(1,95)/60.d0,times(2,95)/60.d0 ; flush(6)
cc end if
c if(mpi_rank .eq. mpi_size-1) then
c ptdone = .false.
c do while(.not.ptdone)
c flag = .true.
c do while(flag)
c call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
c $ ccsd_communicator, flag, stat, mpi_err)
c if(flag) then
c call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator, stat, mpi_err)
c prk = prk + comm_scr
c end if
c enddo
c
c pr=dble(prk)/dkji !progress counter
c if(pr.ge.prold+0.1d0.and.pr.ne.1.d0) then
c prold=pr
c write(iout,'(i4,"% done.")')nint(pr*100)
c endif
c
c call MPI_Test(request, ptdone, MPI_STATUS_IGNORE,
c $ mpi_err)
c end do
c end if
c
ccc call MPI_Barrier(ccsd_communicator, mpi_err)
ccc write(*,*) "rank:",mpi_rank,et
ccc call MPI_Barrier(ccsd_communicator, mpi_err)
cc if(mpi_rank .eq. 0) then
cc call MPI_Reduce(MPI_IN_PLACE, et, 1, MPI_DOUBLE_PRECISION,
cc $ MPI_SUM, 0, ccsd_communicator, request, mpi_err)
cc else
cc call MPI_Reduce(et, et, 1, MPI_DOUBLE_PRECISION,
cc $ MPI_SUM, 0, ccsd_communicator, request, mpi_err)
cc end if
c call MPI_Allreduce(MPI_IN_PLACE, et, 1, MPI_DOUBLE_PRECISION,
c $ MPI_SUM, ccsd_communicator, mpi_err)
c end if
c#endif
c if (can) et=et/nquad
cc write(6,'(f17.12)')et
cc call timer
cc print energy contribution of index triples ijk
cc do j=1,nocc
cc do i=1,nocc
cc if (i.lt.j) epair(j,i)=epair(i,j)
cc et2=et
cc if (et2.lt.1.d-15) et2=et2+1.d-15
cc write(112,'(2i5,9es20.12)') j,i,epair(j,i),epair(j,i)/et2,
cc $ fdo(i)+fdo(j)+fdo(1)
cc enddo
cc enddo
cc close(112)
c return
c end subroutine
cC }}}
c
c {{{ localbtcorr2 with loops
************************************************************************
subroutine localbtcorr2(t,tt,fdo,fdv,ijka,abij,
C nincs megkotes k-ra!
$w,v,bsymmv,symmocc,dgroup,first,last,co,ico,
$irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
************************************************************************
implicit none
integer irecln,ibufln,nbasis,nocc,nvirt,dgroup,nirmax
integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
integer co(nbasis),ico(nbasis)
real*8 t(nvirt,nocc),tt(nocc,nocc,nvirt,nvirt)
real*8 fdv(nvirt),fdo(nocc)
real*8 et,sum,sum2,y
integer a,b,c,d,i,j,k,l,p,q,r,s
real*8,allocatable:: vint(:,:,:,:)
real*8,allocatable:: w2(:,:,:,:,:,:)
real*8,allocatable:: v2(:,:,:,:,:,:)
c real*8,allocatable:: w(:,:,:)
c real*8,allocatable:: v(:,:,:)
real*8 w(nocc,nocc,nocc),v(nocc,nocc,nocc)
real*8 delta
real*8 iabc(nocc,nvirt,nvirt,nvirt)
real*8 abij(nvirt,nvirt,nocc,nocc)
real*8 ijka(nocc,nocc,nocc,nvirt)
integer imem1
common/memcom/imem1
allocate(vint(nbasis,nbasis,nbasis,nbasis))
allocate(w2(nvirt,nvirt,nvirt,nocc,nocc,nocc))
allocate(v2(nvirt,nvirt,nvirt,nocc,nocc,nocc))
c allocate(w(nocc,nocc,nocc))
c allocate(v(nocc,nocc,nocc))
c Read integral list
c read(inp,*) nbasis,nocc
read(555,*)
read(555,*) (i,j=1,nbasis)
read(555,*)
read(555,*) sum,p,q,r,s
do while(r.ne.0)
vint(p,q,r,s)=sum
vint(q,p,r,s)=sum
vint(p,q,s,r)=sum
vint(q,p,s,r)=sum
vint(r,s,p,q)=sum
vint(r,s,q,p)=sum
vint(s,r,p,q)=sum
vint(s,r,q,p)=sum
read(555,*) sum,p,q,r,s
enddo
c if(maxcor-(imem-imem1).lt.nvirt**3)then
c else
c write(iout,*)'Insufficient memory!'
c call mrccend(1)
c endif
call abijread(abij,bsymmv,symmocc,dgroup,first,last,co,w,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call ijkaread(ijka,bsymmv,symmocc,dgroup,first,last,co,w,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
c call abci4read(1,nocc,iabc,bsymmv,symmocc,dgroup,co,ico,
c $first,last,w)
c call sortabci(bsymmv,symmocc,dgroup,co,ico,first,last,w,
c $maxcor,imem,imem1,dcore,nocc,nvirt,nbasis,iout,nirmax,
c $irecln,ibufln,multpg)
c write(6,*)'d'
c call flush(6)
c call iabcread(nvirt,1,iabc,w,nocc,nvirt,irecln,ibufln)
do i=1,nocc
do c=1,nvirt
do b=1,nvirt
do a=1,nvirt
iabc(i,a,b,c)=vint(c+nocc,a+nocc,b+nocc,i)
c if(iabc(i,a,b,c).ne.iabc2(i,a,b,c))then
c write(6,'(4i3,2f14.10)')i,a,b,c,
c $iabc(i,a,b,c),iabc2(i,a,b,c)
c endif
enddo
enddo
enddo
enddo
c do j=1,nocc
c do i=1,nocc
c do b=1,nvirt
c do a=1,nvirt
c abij(a,b,i,j)=vint(a+nocc,i,b+nocc,j)
c if(abij(a,b,i,j).ne.abij2(a,b,i,j))then
c write(6,'(4i3,2f14.10)')a,b,i,j,
c $abij(a,b,i,j),abij2(a,b,i,j)
c endif
c enddo
c enddo
c enddo
c enddo
c do k=1,nocc
c do j=1,nocc
c do i=1,nocc
c do a=1,nvirt
c ijka(i,j,k,a)=vint(i,k,j,a+nocc)
c if(ijka(i,j,k,a).ne.ijka2(i,j,k,a))then
c write(6,'(4i3,2f14.10)')i,j,k,a,
c $ijka(i,j,k,a),ijka2(i,j,k,a)
c endif
c enddo
c enddo
c enddo
c enddo
C Construct W Eq.4
et=0.d0
do c=1,nvirt
do b=c,nvirt
do a=b,nvirt
do k=1,nocc
do j=1,nocc
do i=1,nocc
sum=0.d0
sum2=0.d0
do d=1,nvirt
sum=sum+iabc(i,d,a,b)*tt(k,j,c,d)
sum=sum+iabc(i,d,a,c)*tt(j,k,b,d)
sum=sum+iabc(k,d,c,a)*tt(j,i,b,d)
sum=sum+iabc(k,d,c,b)*tt(i,j,a,d)
sum=sum+iabc(j,d,b,c)*tt(i,k,a,d)
sum=sum+iabc(j,d,b,a)*tt(k,i,c,d)
enddo
do l=1,nocc
sum=sum-ijka(j,k,l,c)*tt(i,l,a,b)
sum=sum-ijka(k,j,l,b)*tt(i,l,a,c)
sum=sum-ijka(i,j,l,b)*tt(k,l,c,a)
sum=sum-ijka(j,i,l,a)*tt(k,l,c,b)
sum=sum-ijka(k,i,l,a)*tt(j,l,b,c)
sum=sum-ijka(i,k,l,c)*tt(j,l,b,a)
enddo
w2(a,b,c,i,j,k)=sum
w(i,j,k)=sum
c enddo
c enddo
c enddo
c enddo
c enddo
c enddo
C Construct V Eq.5
c do k=1,nocc
c do j=1,nocc
c do i=1,nocc
c do c=1,nvirt
c do b=1,nvirt
c do a=1,nvirt
v(i,j,k)=w(i,j,k)+
$abij(b,c,j,k)*t(a,i)+
$abij(a,c,i,k)*t(b,j)+
$abij(a,b,i,j)*t(c,k)
C denominator into W
w(i,j,k)=w(i,j,k)/(fdo(i)+fdo(j)+fdo(k)-fdv(a)-fdv(b)-fdv(c))
enddo
enddo
enddo
c enddo
c enddo
c enddo
C Calculate Energy
c et=0.d0
c do c=1,nvirt
c do b=c,nvirt
c do a=b,nvirt
sum=0.d0
delta=1.d0
if(a.eq.b)delta=delta+1.d0
if(c.eq.b)delta=delta+1.d0
do k=1,nocc
do j=1,nocc
c sum2=0.d0
c if(i.eq.j)then
y=v(k,j,j)+v(j,k,j)+v(j,j,k)
sum=sum
$+ w(j,j,k)*(3.d0*v(j,j,k)-y)
$+ w(j,k,j)*(3.d0*v(j,k,j)-y)
$+ w(k,j,j)*(3.d0*v(k,j,j)-y)
c endif
c sum=sum+sum2/(fdo(j)+fdo(j)+fdo(k)-fdv(a)-fdv(b)-fdv(c))
c sum=sum+sum2
do i=j+1,nocc
c sum2=0.d0
c if(i.gt.j)then
sum=sum+
$ 2.d0/3.d0*
$ (w(i,j,k)+w(j,k,i)+w(k,i,j)-w(i,k,j)-w(j,i,k)-w(k,j,i))*
$ (v(i,j,k)+v(k,i,j)+v(j,k,i)-v(i,k,j)-v(j,i,k)-v(k,j,i))
$+ w(i,j,k)*(2.d0*v(i,j,k)-v(i,k,j)-v(k,j,i))
$+ w(j,k,i)*(2.d0*v(j,k,i)-v(k,j,i)-v(j,i,k))
$+ w(k,i,j)*(2.d0*v(k,i,j)-v(j,i,k)-v(i,k,j))
$+ w(i,k,j)*(2.d0*v(i,k,j)-v(i,j,k)-v(k,i,j))
$+ w(j,i,k)*(2.d0*v(j,i,k)-v(k,i,j)-v(j,k,i))
$+ w(k,j,i)*(2.d0*v(k,j,i)-v(j,k,i)-v(i,j,k))
c endif
c sum=sum+sum2/(fdo(i)+fdo(j)+fdo(k)-fdv(a)-fdv(b)-fdv(c))
c sum=sum+sum2
enddo
enddo
enddo
et=et+sum/delta
enddo
enddo
enddo
c et=et/3.d0
write(6,'("(T)en: ",f17.12)')et
return
end subroutine
C }}}
c
c {{{ btcorr
************************************************************************
subroutine btcorr(t0,tt,fdo,fdv,aijk,abij,wa,va,
$bsymmv,symmocc,dgroup,first,last,co,et,times,rr,rrd,
$ccsdalg,jij,jai0,jab0,dfnb,jlen,Iabija,
$nquad, quad, tl, abc1, abci, ttla, ss, abci_inmem, cmo, localcc,
$talg,lljab, lljpai, Apbc, nblocki, iblocki, tmpvsize, ttl2a,ttl3a,
$lapl,irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg,
$iout,maxcor,imem,dcore,iimem,icore,scr,mpi_rank,mpi_size,mpi_ccsd,
$ccsdmkl, ptthreads, omp_max_threads, mkl_max_threads,
$ccsdrest, ptrest, bcast_comm,ccsd_communicator, lno, lnaf, ets,
$master_rank,master_thread,lf12,tfact)
************************************************************************
#if defined(MPI)
c use mpi
c use counter
use manager
#endif
implicit none
#if defined(MPI)
include "mpif.h"
integer stat(MPI_STATUS_SIZE), bcast_comm
integer :: comm_scr
#else
integer stat, bcast_comm, MPI_STATUS_IGNORE, comm_scr
#endif
integer ptthreads, omp_max_threads, mkl_max_threads
integer dfnb, master_rank
integer irecln,ibufln,nbasis,nocc,nvirt,dgroup,nirmax
integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
integer iout, maxcor, imem, kjiind
real*8 dcore(*)
real*8 quad(nvirt+nocc, nquad), tl(nvirt, nocc)
real*8 abci(nvirt, nvirt, nvirt, *),
$ abc1(nvirt, nvirt, nvirt)
real*8 cmo(nocc, nocc)
real*8 ttla(nvirt,nvirt,nocc,1)
real*8 ss(nvirt,nvirt,nocc,nocc),fb
real*8 ttl2a(nvirt,nvirt,1,nocc),
$ ttl3a(nvirt,nvirt,ptthreads)
real*8 lljab(dfnb,nvirt,nvirt)
real*8 tmpvsize(*)
real*8 lljpai(dfnb,nvirt,nocc),Apbc(dfnb,nvirt,nvirt)
integer icore(*), iimem
integer nquad, nblocki, iblocki
real*8, pointer :: ttl(:,:,:,:), ttl2(:,:,:,:), ttl3(:,:),
$ ttl4(:,:)
c
character*3 ccsdmkl
character*4 ccsdrest, localcc, talg
logical mpi_ccsd, ptrest, abci_inmem, master_thread, lf12
integer mpi_rank, mpi_size, mpi_err, request
integer npr
integer blocknumber, bsize, kk, prk_sent
real*8 jij(dfnb, nocc, nocc), jab0(dfnb, nvirt, nvirt)
real*8 jai0(nvirt, nocc, dfnb) ! originally in iap order
character*8 ccsdalg
real*8 scr(nocc, nvirt), scr1o(nocc), scr1v(nvirt)
real*8, pointer :: abc_ofi(:,:,:), abc_ofj(:,:,:), abc_ofk(:,:,:)
real*8, pointer :: abc_k(:,:,:,:)
logical flag,ptdone, lapl, new_k, iinbl, localccXp
integer nummes, jmax, iquad
real*8, pointer :: t(:,:), jai(:,:,:), jab(:,:,:)
integer rrd,jlen
real*8 t0(nvirt,nocc),tt(nvirt,nvirt,nocc,nocc),fi,fj,fij,fbij
real*8 rr(nvirt,nvirt,nocc,rrd,*)
real*8, pointer :: rrp(:,:,:,:)
real*8 fdv(nvirt),fdo(nocc), fc
real*8 et,esum,delta,delta1,delta2,delta3,y,z,esum2,w1,w2
real*8 fkj,fkji,fcb,fcba,deltaij,fact,et2,epair(nocc,nocc)
integer a,b,c,i,j,k, k2, j2, i2
real*8 wa(nvirt,nvirt,nvirt,*),va(nvirt,nvirt,nvirt,*)
real*8, pointer :: w(:,:,:), v(:,:,:)
real*8 abij(nvirt,nvirt,nocc,jlen) !<12|12>
real*8 aijk(nvirt,nocc,nocc,nocc) !<12|12>
real*8 pr,prold,dkji
real*8 IabIJa(nvirt, nvirt, ptthreads)
real*8, pointer :: IabIJ(:,:)
integer co(nbasis)
integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
logical notend,abijinmem, abckinmem, abcjinmem, abciinmem, rstex
logical kdone, jdone(nocc), kjdone
logical*1 :: kjidone
integer ind,n,li, ttd
integer imem1,istart,astart,kinrr,prk,kji, jinrr
common/memcom/imem1
c debug
real*8 times(6,0:100), times0(2)
c real*8 kjinum(nocc)
! MPI declaration{{{
integer lk, nn
#ifdef MPI
integer :: counter_own
integer, pointer :: requests(:), ijk_indices(:), toSend(:)
integer, pointer :: requestsAll(:)
real*8, pointer :: energies(:)
#else
integer :: counter_own
integer :: requests, ijk_indices, toSend
integer :: requestsAll
real*8 :: energies
#endif
integer :: ccsd_communicator
integer :: counter_k, counter_win
integer :: ifree, reqptr, nReqs, nReqsAll
integer ijk, reqused, nReqs0, findex, lindex
integer ijk3(3*ptthreads), req3(3*ptthreads), toSend3(3*ptthreads)
real*8 en3(3*ptthreads)!}}}
c real*8, pointer :: w1(:), w2(:), x(:), y1(:), z1(:)
integer iok
real*8 tmp, denom, ei(nocc)
integer omp_get_num_threads, omp_get_thread_num
integer nthreade, mkl_threads, istat,
$ mkl_set_num_threads_local
integer :: omp_num_threads, omp_thread_num
integer dblalloc, kmax
integer iabci, iabcj, iabck
integer count_k
integer iijk, iene, ireq, itos, ijk_tot, intalloc, ijk_index
integer mpi_blocklen(2), mpi_displ(2), mpi_types(2)
integer mpi_type_e
real*8, pointer :: qfact(:)
integer iqfact
real*8 ecabs, emp2f12, tfact, ets
logical lno, lnaf
interface!{{{
c {{{ interfaces for pointers
subroutine rpoint1d(egydim,haromdim,dim1)
implicit none
integer dim1
real*8,target :: egydim(dim1)
real*8, pointer :: haromdim(:)
end subroutine
subroutine rpoint2d(egydim,haromdim,dim1,dim2)
implicit none
integer dim1,dim2
real*8,target :: egydim(dim1,dim2)
real*8, pointer :: haromdim(:,:)
end subroutine
subroutine rpoint3d(egydim,haromdim,dim1,dim2,dim3)
implicit none
integer dim1,dim2,dim3
real*8,target :: egydim(dim1,dim2,dim3)
real*8, pointer :: haromdim(:,:,:)
end subroutine
subroutine rpoint4d(egydim,haromdim,dim1,dim2,dim3,dim4)
implicit none
integer dim1,dim2,dim3,dim4
real*8,target :: egydim(dim1,dim2,dim3,dim4)
real*8, pointer :: haromdim(:,:,:,:)
end subroutine
subroutine ipoint1d(egydim1,egydim2,dim1)
implicit none
integer dim1
integer, target :: egydim1(dim1)
integer, pointer :: egydim2(:)
end subroutine
c}}}
subroutine get_abci(i, abc_ofi, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis, nocc, nvirt,
$ nirmax, multpg, ccsdalg, dfnb, jab, jai, dcore, imem, n,
$ addr, ccsdmkl)
implicit none
integer :: i, nvirt, nocc, nbasis, dfnb, imem, n, addr
integer :: irecln, ibufln, nirmax, dgroup
integer :: bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
real*8 :: va(nvirt,nvirt,nvirt,*)
real*8 :: dcore(*)
integer :: co(nbasis)
integer :: first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
real*8, pointer :: abc_ofi(:, :, :)
real*8 :: jai(nvirt, nocc, dfnb), jab(dfnb, nvirt, nvirt)
character*8 :: ccsdalg
character*3 :: ccsdmkl
end subroutine
end interface!}}}
if((lno.or.lnaf.or.lf12).and.trim(localcc).eq.'off') then
iqfact = dblalloc(nocc)
call rpoint1d(dcore(iqfact), qfact, nocc)
open(800, file = 'F12INTE', form = 'unformatted')
rewind(800)
read(800) ecabs, emp2f12
read(800) tfact, qfact
close(800)
end if
#ifdef MPI
c initialize arrays for MPI restart!{{{
if(ccsdrest .ne. 'off') then
ifree = maxcor - (imem-imem1)
ijk_tot = ijk_index(nocc-1, nocc, nocc, nocc)
if(ifree .lt. ijk_tot * 4) then
write(*, '(a, i3)') ' Insufficient memory on rank ',
$ mpi_rank
call mrccend(1)
end if
nReqs = ijk_tot
nReqs0 = ijk_tot
nReqsAll = ijk_tot
iijk = intalloc(ijk_tot)
iene = dblalloc(ijk_tot)
ireq = intalloc(ijk_tot)
itos = intalloc(ijk_tot)
call ipoint1d(icore(iijk), ijk_indices, nReqs)
call rpoint1d(dcore(iene), energies, nReqs)
call ipoint1d(icore(ireq), requestsAll, nReqs)
call ipoint1d(icore(ireq), requests, nReqs)
call ipoint1d(icore(itos), toSend, nReqs)
requests = MPI_REQUEST_NULL
c if(ifree .ge. 4*3*ptthreads) then
c ifree = min(ifree, 4096)
c nReqs = ifree/3 !4
c
c call ipoint1d(icore(iimem+n*nvirt**3), requests, nReqs)
c call ipoint1d(icore(iimem+n*nvirt**3+nReqs), ijk_indices,
c $ nReqs)
c call rpoint1d(dcore(iimem+n*nvirt**3+2*nReqs), energies,
c $ nReqs)
c
c else
c nReqs = 3*ptthreads
c
c call ipoint1d(req3, requests, nReqs)
c call ipoint1d(ijk3, ijk_indices, nReqs)
c call rpoint1d(en3, energies, nReqs)
c end if
c
c nReqs0 = nReqs
c nReqs = nReqs / ptthreads
c requests = MPI_REQUEST_NULL
end if!}}}
#endif
! initialization {{{
reqused = 0
nummes = 0
#if defined(MPI)
comm_scr = 0
prk_sent = 0
#endif
et=0.d0
ets=0.d0
et2=0.d0
ind=1
notend=.true.
prk=0
pr=0.d0
prold=0.d0
epair=0.d0
ei = 0.d0
lapl = talg.eq.'lapl' .or. talg.eq.'topr' .or. talg.eq.'to '
$ .or. talg.eq.'lato'
if(.not.lapl) then
nquad = 1
ttd = nocc
else
ttd = 1
end if
if(ptthreads .eq. 1) then
nthreade = omp_max_threads
else
nthreade = omp_max_threads / ptthreads
end if
if(ccsdmkl .eq. 'thr') then
mkl_threads = mkl_max_threads / ptthreads
else
mkl_threads = 1
end if
if(lapl) then ! .and. localcc.ne.'off ') then
call rpoint3d(abc1, abc_ofk, nvirt, nvirt, nvirt)
call rpoint3d(abci, abc_ofj, nvirt, nvirt, nvirt)
if(iblocki+1 .lt. nocc) then
n = iblocki - 2 ! 1 for abc_ofj
nn = n + 2 ! abcl's, abc1 and abcj
call rpoint4d(abci(1, 1, 1, 2), abc_k,
$ nvirt, nvirt, nvirt, n)
else
n = nocc
nn = n
call rpoint4d(abc1, abc_k, nvirt, nvirt, nvirt, nocc)
end if
else
n = blocknumber(nvirt**3, 1, nocc, 'y', 1, bsize,
$ 'Insufficient memory for <ab|ci> block! ',
$ maxcor - (imem-imem1))
nn = n
if(n .lt. min(nocc, 1 + ptthreads * 2)) then
write(iout, *) 'Error: insufficient memory in btcorr'
call mrccend(1)
end if
if(n .ne. nocc) then
c abckinmem = .true.
c abcjinmem = .true.
c abciinmem = .true.
c if(n .lt. 1 + 2 * ptthreads) then
c if(n .lt. 1 + ptthreads) then
c if(n .lt. 1) then
c abckinmem = .false.
c else
c n = n - 1
c call rpoint3d(dcore(imem+n*nvirt**3), abc_ofk,
c $ nvirt, nvirt, nvirt)
c end if
c abcjinmem = .false.
c else
c n = n - 1 - ptthreads
c call rpoint3d(dcore(imem+n*nvirt**3), abc_ofk,
c $ nvirt, nvirt, nvirt)
c call rpoint3d(dcore(imem+(n+1)*nvirt**3), abc_ofj,
c $ nvirt, nvirt, nvirt)
c end if
c abciinmem = .false.
c else
n = n - 1 - 2 * ptthreads
iabck = dblalloc(nvirt**3)
call rpoint3d(dcore(iabck), abc_ofk, nvirt, nvirt, nvirt)
iabcj = dblalloc(ptthreads * nvirt**3)
call rpoint3d(dcore(iabcj), abc_ofj, nvirt, nvirt, nvirt)
iabci = dblalloc(ptthreads * nvirt**3)
call rpoint3d(dcore(iabci), abc_ofi, nvirt, nvirt, nvirt)
if(n .gt. 0)
$ call rpoint4d(dcore(imem), abc_k, nvirt, nvirt, nvirt, n)
c end if
else
call rpoint4d(dcore(imem), abc_k, nvirt, nvirt, nvirt, nocc)
end if
end if
c all kji combination:
npr = nocc - n
kji = 0
if(localcc.ne.'off ' .and. talg.eq.'lapl') then
kmax = 1
else
kmax = nocc
end if
do k = 1, kmax
do i = k+1, nocc ! j = k
kji = kji + 1
end do
do j = k+1, nocc ! j>k
do i = j, nocc
kji = kji + 1
end do
end do
enddo
dkji = dble(kji) * nquad
#if defined(MPI)
if(mpi_ccsd) then
if(mpi_size .gt. nocc .and. master_thread) then !mpi_rank .eq. 0) then
write(6, '(" Warning: only ", i3,
$ " MPI threads were utilized for (T)"/)') nocc
end if
if(mpi_size .gt. 1) then
c call create_counter(ccsd_communicator, counter_k)
call create_manager(ccsd_communicator, master_rank)
call allocate_counter(ccsd_communicator, count_k)
end if
end if
#endif
if(ccsdalg.eq.'dfdirect' .and. localcc.eq.'off ') then
! jai contains Jia, transpose it
do k = 1, dfnb
call dcopy(nvirt*nocc, jai0(1, 1, k), 1, scr, 1)
do i = 1, nocc
call dcopy(nvirt, scr(i, 1), nocc, jai0(1, i, k), 1)
end do
end do
end if
if(lapl) then
quad = dsqrt(quad)
call rpoint2d(tl, t, nvirt, nocc)
call rpoint4d(ttla, ttl, nvirt, nvirt, nocc, 1)
call rpoint4d(ttl2a, ttl2, nvirt, nvirt, 1, nocc)
c call rpoint2d(ttl3a, ttl3, nvirt, nvirt)
c call rpoint2d(IabIJ, ttl4, nvirt, nvirt)
else
call rpoint2d(t0, t, nvirt, nocc)
call rpoint4d(tt, ttl, nvirt, nvirt, nocc, nocc)
call rpoint4d(tt, ttl2, nvirt, nvirt, nocc, nocc)
c call rpoint2d(tt0(1:nvirt, 1:nvirt, 1, 1), ttl3,
c nvirt, nvirt)
c call rpoint2d(tt0(1:nvirt, 1:nvirt, 1, 1), ttl4,
c nvirt, nvirt)
end if
if(lapl .and. localcc.ne.'off ') then
call rpoint3d(lljpai, jai, nvirt, nocc, dfnb)
call rpoint3d(lljab, jab, dfnb, nvirt, nvirt)
else
call rpoint3d(jai0, jai, nvirt, nocc, dfnb)
call rpoint3d(jab0, jab, dfnb, nvirt, nvirt)
end if
if(lapl .and. localcc.ne.'off ' .and. ccsdalg.eq.'disk ')
$ ccsdalg = 'dfdirect'
abijinmem=.true.
if (jlen.eq.1.and.(ccsdalg.eq.'dfdirect'.or.
$ localcc.ne.'off '))abijinmem=.false.
!}}}
c create and open restart file!{{{
if(ccsdrest .ne. 'off') then
inquire(file = 'pt.rst', exist = rstex)
c if(mpi_rank .eq. 0 .and. (.not. ptrest .or. .not.rstex))
if(master_thread .and. (.not. ptrest .or. .not.rstex))
$ call zero_rst_pt(nocc)
#ifdef MPI
call bcast_file('pt.rst', bcast_comm)
if(mpi_size .gt. 1) call MPI_Barrier(ccsd_communicator,mpi_err)
#endif
open(unit = 725, file = 'pt.rst', access = 'stream')
end if!}}}
write(iout, *)
if(talg .eq. 'occ ') then
write(iout,'(f12.2," MB memory is utilized for (T)"/)')
$ (imem+n*nvirt**3-imem1)*8.d0/1024**2
else
write(iout,'(f12.2," MB memory is utilized for (T)"/)')
$ (imem-imem1)*8.d0/1024**2
end if
c write(iout,'(" 1% done.")')
call progress_bar(iout, 0.d0, .true.)
do iquad = 1, nquad
new_k = .true.
if(lapl) then
c multiply with Laplace factors
if(localcc.eq.'off ')then
c {{{ localcc.eq.'off ' case: FOR TEST PURPOSES, not efficient
do i=1,nocc
fi=quad(i,iquad)
t(1:nvirt,i)=t0(1:nvirt,i)*quad(nocc+1:nocc+nvirt,iquad)*fi
enddo
c lt1=t
c call dgemm('n','n',nvirt,nocc,nocc,1.d0,lt1,nvirt,cmo,nocc,
c $ 0.d0,t,nvirt)
C read integrals to memory
if(ccsdalg .eq. 'disk ') then
call abijread(abij,bsymmv,symmocc,dgroup,first,last,co,va,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call aijkread(aijk,bsymmv,symmocc,dgroup,first,last,co,va,
$ irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
else
! (ai|bj) = <ab|ij>
call dgemm('n', 't', nvirt*nocc, nvirt*nocc, dfnb,
$ 1.d0, jai, nvirt*nocc, jai, nvirt*nocc,
$ 0.d0, abij, nvirt*nocc)
! aibj -> abij
do j = 1, nocc
do a = 1, nvirt
call dcopy(nocc*nvirt, abij(a, 1, 1, j), nvirt, scr,1)
do b = 1, nvirt
abij(a, b, 1:nocc, j) = scr(1:nocc, b)
end do
end do
end do
! (aj|ik) = <ai|jk>
call dgemm('n', 'n', nvirt*nocc, nocc**2, dfnb,
$ 1.d0, jai, nvirt*nocc, jij, dfnb,
$ 0.d0, aijk, nvirt*nocc)
! ajik -> aijk
do k = 1, nocc
do a = 1, nvirt
do j = 1, nocc
scr1o(j+1:nocc) = aijk(a, j+1:nocc, j, k)
aijk(a, j+1:nocc, j, k) = aijk(a, j, j+1:nocc, k)
aijk(a, j, j+1:nocc, k) = scr1o(j+1:nocc)
end do
end do
end do
end if
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(j,fj,i,fij,b,fbij)
do j=1,nocc
fj=quad(j,iquad)
do i=1,nocc
fij=quad(i,iquad)*fj
do b=1,nvirt
fbij=quad(nocc+b,iquad)*fij
abij(1:nvirt,b,i,j)=abij(1:nvirt,b,i,j)*
$ quad(nocc+1:nocc+nvirt,iquad)*fbij
enddo
enddo
enddo
C$OMP END PARALLEL DO
do j=1,nocc
fj=quad(j,iquad)
if (iquad.gt.1) fj=fj/quad(j,iquad-1)
do i=1,nocc
fij=quad(i,iquad)*fj
if (iquad.gt.1) fij=fij/quad(i,iquad-1)
do b=1,nvirt
do a=1,nvirt
fbij=fij*quad(nocc+b,iquad)*quad(nocc+a,iquad)
if (iquad.gt.1)
$ fbij=fbij/quad(nocc+a,iquad-1)/quad(nocc+b,iquad-1)
tt(a,b,i,j)= tt(a,b,i,j)*fbij ! ss is in tt
enddo
enddo
enddo
enddo
if(ccsdalg .eq. 'disk ') then
c call abciread(1,nvirt,abci,bsymmv,symmocc,dgroup,co,
c $first,last,v,
c $irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call abciread(1,nocc,abc_k,bsymmv,symmocc,dgroup,co,
$ first,last,va,irecln,ibufln,nbasis,nocc,nvirt,
$ nirmax,multpg)
else
! (ac|bi) = <ab|ci>
call dgemm('t', 't', nvirt**2, nvirt*nocc, dfnb,
$ 1.d0, jab, dfnb, jai, nvirt*nocc,
$ 0.d0, abc_k, nvirt**2)
! acbi -> abci
do i = 1, nocc
do a = 1, nvirt
do b = 1, nvirt
scr1v(b+1:nvirt) = abc_k(a, b+1:nvirt, b, i)
abc_k(a,b+1:nvirt,b,i) = abc_k(a,b,b+1:nvirt,i)
abc_k(a, b, b+1:nvirt, i) = scr1v(b+1:nvirt)
end do
end do
end do
end if
do c=1,nvirt
fc=quad(nocc+c,iquad)
do b=1,nvirt
fj=quad(nocc+b,iquad)*fc
do a=1,nvirt
fij=quad(nocc+a,iquad)*fj
do i=1,nocc
abc_k(a,b,c,i)=abc_k(a,b,c,i)*quad(i,iquad)*fij
enddo
enddo
enddo
enddo
do j=1,nocc
fj=quad(j,iquad)
do i=1,nocc
fij=quad(i,iquad)*fj
do k=1,nocc
do a=1,nvirt
aijk(a,k,i,j)=aijk(a,k,i,j)*fij*
$ quad(nocc+a,iquad)/quad(k,iquad)
enddo
enddo
enddo
enddo
c }}}
else ! if localcc=on
c multiply with Laplace factors and trf to the Mayer basis: (parts of) T2, abci, abij and aijk
call trf2laplbasis(tt, ttl, jij, jab0, jai0, nocc, nvirt,
$ dfnb, abij, abc_ofk, aijk, quad, iquad, nquad, cmo,
$ aijk, jai, jab, rr, rr, wa, wa, aijk, va, t, t, t0, talg,
$ fdo, va, wa, abci_inmem, ttl2, wa, wa, IabIJa)
! Jpai -> Jaip
call gentrans(jai, 1, dfnb, nvirt*nocc, nvirt*nocc*dfnb,
$ tmp, dcore(imem), (dfnb+nvirt*nocc)/2, iok)
if(iok .ne. 0) then
write(*,*) 'Error while reordering Jai in btcorr'
call mrccend(1)
end if
end if
endif ! lapl
c
c transe tt(a,b,i,k) to rr(b,a,i,k) for all k
TIME0(times)
if (rrd.eq.nocc) then
do k=1,nocc
do i=1,nocc
call tr(nvirt,tt(1,1,i,k),rr(1,1,i,k,1))
enddo
enddo
endif
TIMEADD(times(1,25),times)
if(talg .eq. 'occ ') then
C get <ab|ij> and <ai|jk> integrals !{{{
if(ccsdalg .eq. 'disk ') then
if(abijinmem)
$ call abijread(abij,bsymmv,symmocc,dgroup,first,last,co,
$ va,irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
call aijkread(aijk,bsymmv,symmocc,dgroup,first,last,co,
$ va,irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
else if(ccsdalg .eq. 'dfdirect') then
! (ai|bj) = <ab|ij>
if (abijinmem) then
call dgemm('n', 't', nvirt*nocc, nvirt*nocc, dfnb,
$ 1.d0, jai, nvirt*nocc, jai, nvirt*nocc,
$ 0.d0, abij, nvirt*nocc)
! aibj -> abij
do j = 1, nocc
do a = 1, nvirt
call dcopy(nocc*nvirt,abij(a,1,1,j),nvirt,scr,1)
do b = 1, nvirt
abij(a, b, 1:nocc, j) = scr(1:nocc, b)
end do
end do
end do
endif ! abijinmem
! (aj|ik) = <ai|jk>
call dgemm('n', 'n', nvirt*nocc, nocc**2, dfnb,
$ 1.d0, jai, nvirt*nocc, jij, dfnb,
$ 0.d0, aijk, nvirt*nocc)
! ajik -> aijk
do k = 1, nocc
do a = 1, nvirt
do j = 1, nocc
scr1o(j+1:nocc) = aijk(a, j+1:nocc, j, k)
aijk(a,j+1:nocc,j,k) = aijk(a,j,j+1:nocc,k)
aijk(a, j, j+1:nocc, k) = scr1o(j+1:nocc)
end do
end do
end do
end if !}}}
end if
TIME0(times)
if (.not.(lapl .and. localcc.eq.'off ') .and. n.ge.1) then
c get <ab|ci> integrals !{{{
if(ccsdalg .eq. 'disk ') then
call abciread(nocc-n+1, n, abc_k, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis, nocc,
$ nvirt, nirmax, multpg)
else if(ccsdalg .eq. 'dfdirect') then
! (ac|bk) = <ab|ck>
call dgemm('t', 't', nvirt**2, nvirt*n, dfnb,
$ 1.d0, jab, dfnb, jai(1, nocc-n+1, 1), nvirt*nocc,
$ 0.d0, abc_k, nvirt**2)
! acbk -> abck
do k = 1, n
do a = 1, nvirt
do b = 1, nvirt
scr1v(b+1:nvirt) = abc_k(a, b+1:nvirt, b, k)
abc_k(a, b+1:nvirt, b, k) =
$ abc_k(a, b, b+1:nvirt, k)
abc_k(a, b, b+1:nvirt, k) = scr1v(b+1:nvirt)
end do
end do
end do
end if
end if !}}}
TIMEADD(times(1,12),times)
kk = 0 ! kk=k for canonical, kk=j for local
do ! loop over index k for canonical or index j for local calculation
TIME0(times(1,99))
#if defined(MPI)
if(mpi_size .gt. 1) then
call get_and_increment_counter(counter_own, 1,
$ count_k)
kk = counter_own
if(k .gt. nocc+mpi_size .or. k .le. 0) then
write(*, '("WARNING: Counter in rank ", i4,
$ " out of range in (T): ", i25)') mpi_rank, kk
c call free_manager(ccsd_communicator, counter_k)
call deallocate_counter(ccsd_communicator, count_k)
call free_manager(ccsd_communicator)
call mrccend(1)
end if
else
#endif
kk = kk + 1
#if defined(MPI)
end if
#endif
if(kk .gt. nocc) exit
if(localcc .eq. 'off ') then
k = kk
else
k = 1
end if
if(ptrest) then
kdone = .true.
do j = k, nocc
if(j .eq. k) then
istart = j + 1
else
istart = j
endif
kjdone = .true.
do i = istart, nocc
call read_rst_pt(k, j, i, nocc, kjidone, esum)
if(kjidone) then
et = et + 2.d0 * esum
if(lno.or.lnaf.or.lf12.and.trim(localcc).eq.'off')
$ ets=ets+2*esum*(qfact(i)+qfact(j)+qfact(k))/3
prk = prk + 1
else
kdone = .false.
kjdone = .false.
end if
end do
jdone(j) = kjdone
end do
if(kdone) goto 1235
end if
ind = nocc - n + 1
if(new_k) then
c get <ab|ck> integrals for one k !{{{
TIME0(times)
if(k .ge. ind) then
lk = k - ind + 1
call rpoint3d(abc_k(1:nvirt, 1:nvirt, 1:nvirt, k - ind + 1),
$ abc_ofk, nvirt, nvirt, nvirt)
else !if(abckinmem) then
if(lapl) then ! .and. localcc.ne.'off ') then
call rpoint3d(abc1, abc_ofk, nvirt, nvirt, nvirt)
else
call rpoint3d(dcore(iabck), abc_ofk, nvirt, nvirt, nvirt)
end if
c if(ccsdalg .eq. 'disk ') then
c call abciread(k, 1, abc_ofk, bsymmv, symmocc, dgroup,
c $ co, first, last, va, irecln, ibufln, nbasis,
c $ nocc, nvirt, nirmax, multpg)
c
c else if(ccsdalg .eq. 'dfdirect') then
c ! (ac|bk) = <ab|ck>
c call dgemm('t', 't', nvirt**2, nvirt, dfnb,
c $ 1.d0, jab, dfnb, jai(1, k, 1), nvirt*nocc,
c $ 0.d0, abc_ofk, nvirt**2)
c
c ! acbk -> abck
c do a = 1, nvirt
c do b = 1, nvirt
c scr1v(b+1:nvirt) = abc_ofk(a, b+1:nvirt, b)
c abc_ofk(a, b+1:nvirt, b) =
c $ abc_ofk(a, b, b+1:nvirt)
c abc_ofk(a, b, b+1:nvirt) = scr1v(b+1:nvirt)
c end do
c end do
c
c end if
c write(*,*) "assemble abck",k
call get_abci(k, abc_ofk, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis, nocc,
$ nvirt, nirmax, multpg, ccsdalg, dfnb, jab, jai, dcore,
$ imem, n, iabck, ccsdmkl) !iabck not correct
end if
TIMEADD(times(1,12),times)!}}}
c
c transpose tt(a,b,i,k) to rr(b,a,i) for k
if(ccsdmkl .eq. 'thr') then
if (rrd.eq.1) then
do i=1,nocc
call tr(nvirt,tt(1,1,i,k),rr(1,1,i,1,1))
enddo
kinrr = 1
else
kinrr = k
endif
end if
c
if (.not.abijinmem .and. localcc.eq.'off ') then
c assembly <ab|jk> for k in abj order, full <ab|jk> did not fit in memory
do j=k,nocc
call dgemm('n','t',nvirt,nvirt,dfnb,
$ 1.d0,jai(1,j,1), nvirt*nocc,jai(1,k,1),nvirt*nocc,
$ 0.d0,abij(1,1,j,1),nvirt)
enddo
endif
c {{{ localcc.eq.'off ' case: FOR TEST PURPOSES, not efficient
if(localcc.eq.'off ' .and. lapl) then
if (talg.eq.'lapl'.or.talg.eq.'lato') then
c make ttl(-,~,i,k) & ttl2(-,~,k,i) from ss(-,-,-.-)
do i=1,nocc
do b=1,nvirt
fb=quad(nocc+b,iquad)**2
ttl (1:nvirt,b,i,1)=tt(1:nvirt,b,i,k)/fb
ttl2(1:nvirt,b,1,i)=tt(1:nvirt,b,k,i)/fb
enddo
enddo
endif
end if
c }}}
if(localcc .ne. 'off ') new_k = .false.
end if
if(localcc.ne.'off ') then
jmax = kk
else
jmax = nocc
end if
c
#if defined(OMP)
if(ptthreads .gt. 1) call omp_set_max_active_levels(2)
#endif
c$OMP PARALLEL DEFAULT(NONE) REDUCTION(+:et,et2,ets)
c$OMP& SHARED(dkji,prold,iout,ind,abc_k,ccsdrest,nReqs0,lf12,
c$OMP& requestsAll,omp_max_threads,ptthreads,ccsdmkl,rrd,nvirt,nocc,
c$OMP& IabIJa,lapl,ttl3a,tt,k,jmax,mpi_rank,mpi_size,pr,prk,iquad,quad,
c$OMP& iabci,ptrest,fdo,abc_ofk,n,imem,jai,ccsdalg,dfnb,jab,iabcj,
c$OMP& bsymmv,symmocc,dgroup,co,first,last,t,irecln,ibufln,nbasis,
c$OMP& nirmax,multpg,kinrr,abij,aijk,times,talg,ttl,ttl2,abijinmem,
c$OMP& localcc,ccsd_communicator,MPI_STATUS_IGNORE,ttd,kk,fdv,stat,
c$OMP& abci_inmem,tmpvsize,Apbc,jij,prk_sent,nummes,epair,request,flag,
c$OMP& mkl_max_threads,jdone,master_thread,master_rank,requests,
c$OMP& ijk_indices,energies,toSend,nReqsAll,
c$OMP& nReqs,lno,lnaf,qfact,ei) !,mpi_blocklen,mpi_displ,mpi_types,
cc$OMP& mpi_type_e)
c$OMP& PRIVATE(rrp,j,abc_ofj,scr1v,jinrr,esum,c,b,delta1,delta2,fkj,
c$OMP& istart,i,li,abc_ofi,delta3,fkji,fcb,astart,delta,fcba,y,z,w,v,
c$OMP& times0,kjiind,kjidone,k2,j2,i2,ijk,reqptr,deltaij,
c$OMP& mpi_err,omp_num_threads,omp_thread_num,IabIJ,
c$OMP& ttl3,ttl4,fb,istat,findex,lindex,denom,iinbl,esum2)
c$OMP& FIRSTPRIVATE(nthreade,mkl_threads,comm_scr)
c$OMP& NUM_THREADS(ptthreads)
c initialization of thread private variables!{{{
#ifdef OMP
omp_thread_num = omp_get_thread_num()
omp_num_threads = omp_get_num_threads()
#else
omp_thread_num = 0
omp_num_threads = 1
#endif
#ifdef MPI
c initialize arrays for MPI restart!{{{
c if(ccsdrest .ne. 'off ') then
cc if(omp_thread_num .eq. 0) then
cc findex = 1
cc lindex = nReqs0/omp_num_threads
cc
cc nReqsAll = nReqs0
cc nReqs = nReqs0/omp_num_threads
cc
cc call ipoint1d(requestsAll, requests, nReqsAll)
cc
cc else
c findex = omp_thread_num*nReqs0/omp_num_threads+1
c lindex = min((omp_thread_num+1)*nReqs0/omp_num_threads,
c $ nReqs0)
cc lindex = nReqs0
c
cc nReqs = nReqs0*(omp_num_threads-1)/
cc $ omp_num_threads
cc nReqs = min(nReqs0/omp_num_threads
cc $ nReqs0-omp_thread_num*nReqs0/omp_num_threads)
c nReqs = lindex - findex +1
c nReqsAll = nReqs
cc write(*,*) 'majom ', findex, lindex, nReqs
c
c call ipoint1d(requestsAll(findex:lindex), requests,
c $ nReqs)
c call ipoint1d(ijk_indices(findex:lindex), ijk_indices,
c $ nReqs)
c call rpoint1d(energies(findex:lindex), energies,
c $ nReqs)
cc call ipoint1d(toSend(findex:lindex), toSend, nReqs)
cc end if
c end if
cc$OMP SINGLE
c call MPI_Get_address(energies, addr_e, mpi_err)
c mpi_blocklen(1) = 1
c mpi_displ(1) = 0
c mpi_types(1) = MPI_DOUBLE_PRECISION
c call MPI_Get_address(ijk_indices, mpi_displ(2), mpi_err)
c mpi_blocklen(2) = 1
c mpi_displ(2) = mpi_displ(2) - addr_e
c mpi_types(2) = MPI_INTEGER
c call MPI_Type_create_struct(2, mpi_blocklen,
c $ mpi_displ, mpi_types, mpi_type_e, mpi_err)
c call MPI_Type_commit(mpi_type_e, mpi_err)
cc$OMP END SINGLE
#else
nReqsAll = 1
nReqs = 1 !}}}
#endif
if(omp_thread_num .lt.
$ mod(omp_max_threads, ptthreads)) then
if(ccsdmkl .eq. 'thr') mkl_threads = mkl_threads + 1
if(ptthreads .ne. 1) nthreade = nthreade + 1
end if
#ifdef MKL
istat = mkl_set_num_threads_local(int(mkl_threads,4))
#endif
if(rrd.eq.1 .and. ccsdmkl.eq.'seq' .and. localcc.eq.'off ')then
call rpoint4d(rr(1, 1, 1, 1, omp_thread_num+1),
$ rrp, nvirt, nvirt, nocc, rrd)
else
call rpoint4d(rr, rrp, nvirt, nvirt, nocc, rrd)
end if
call rpoint3d(wa(1, 1, 1, omp_thread_num+1),
$ w, nvirt, nvirt, nvirt)
call rpoint3d(va(1, 1, 1, omp_thread_num+1),
$ v, nvirt, nvirt, nvirt)
call rpoint2d(IabIJa(1, 1, omp_thread_num+1), IabIJ,
$ nvirt, nvirt)
if(lapl) then
call rpoint2d(ttl3a(1, 1, omp_thread_num+1), ttl3,
$ nvirt, nvirt)
call rpoint2d(IabIJ, ttl4, nvirt, nvirt)
c call rpoint2d(IabIJa(1, 1, omp_thread_num+1), ttl4,
c $ nvirt, nvirt)
else
call rpoint2d(tt(1:nvirt,1:nvirt,1,1), ttl3, nvirt, nvirt)
call rpoint2d(tt(1:nvirt,1:nvirt,1,1), ttl4, nvirt, nvirt)
end if
!}}}
do j = kk, jmax
if(omp_thread_num .eq. 0) then
if(master_thread) then !mpi_rank .eq. 0) then
pr = dble(prk) / dkji
else
pr = dble(prk) / dkji * mpi_size
end if
if(pr .ge. prold+0.1d0 .and. pr.lt.1.d0) then
prold = pr
c write(iout, '(i4, "% done.")') nint(pr * 100)
endif
call progress_bar(iout, pr, .false.)
end if
if(ptrest .and. jdone(j)) goto 1234
c assemble <ab|cj> {{{
if(j .ge. ind) then
call rpoint3d(abc_k(1:nvirt, 1:nvirt, 1:nvirt, j-ind+1),
$ abc_ofj, nvirt, nvirt, nvirt)
else if(j .eq. k) then
call rpoint3d(abc_ofk, abc_ofj, nvirt, nvirt, nvirt)
else if(j .ne. k) then ! .and. abcjinmem) then
TIME0(times0)
if(localcc .eq. 'off ') then
call rpoint3d(dcore(iabcj + omp_thread_num*nvirt**3),
$ abc_ofj, nvirt, nvirt, nvirt)
c if(ccsdalg .eq. 'disk ') then
c call abciread(j, 1, abc_ofj, bsymmv, symmocc, dgroup,
c $ co, first, last, v, irecln, ibufln, nbasis,
c $ nocc, nvirt, nirmax, multpg)
c
c else
c ! (ac|bj) = <ab|cj>
c call dgemm_wabc('t', 't', nvirt**2, nvirt, dfnb,
c $ 1.d0, jab, dfnb, jai(1, j, 1), nvirt*nocc,
c $ 0.d0, abc_ofj, nvirt**2, ccsdmkl, 1, nvirt)
c
c ! acbj -> abcj
c do a = 1, nvirt
c do b = 1, nvirt
c scr1v(b+1:nvirt) = abc_ofj(a, b+1:nvirt, b)
c abc_ofj(a, b+1:nvirt, b) =
c $ abc_ofj(a, b, b+1:nvirt)
c abc_ofj(a, b, b+1:nvirt) = scr1v(b+1:nvirt)
c end do
c end do
c end if
c write(*,*) "assemble abcj",j
call get_abci(j, abc_ofj, bsymmv, symmocc, dgroup,
$ co, first, last, v, irecln, ibufln, nbasis,
$ nocc, nvirt, nirmax, multpg, ccsdalg, dfnb,
$ jab, jai, dcore, imem, n, iabcj +
$ omp_thread_num * nvirt**3,
$ ccsdmkl)
else
call rpoint3d(abci, abc_ofj, nvirt, nvirt, nvirt)
c$OMP SINGLE
#if defined(OMP)
if(ptthreads .gt. 1) call omp_set_max_active_levels(3)
#endif
#ifdef MKL
istat = mkl_set_num_threads_local(int(omp_max_threads,4))
#endif
call get_abci(j, abc_ofj, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis,
$ nocc, nvirt, nirmax, multpg, ccsdalg, dfnb,
$ jab, jai, dcore, imem, n, iabcj,
$ ccsdmkl)
#ifdef MKL
istat = mkl_set_num_threads_local(int(mkl_threads,4))
#endif
#if defined(OMP)
if(ptthreads .gt. 1) call omp_set_max_active_levels(2)
#endif
c$OMP END SINGLE
end if
TIMEADD(times(1,12),times0)
end if ! }}}
! Apbc for localcc small mem!{{{
if(.not.abci_inmem .and. lapl) then !localcc.ne.'off ') then
c$OMP SINGLE
#if defined(OMP)
if(ptthreads .gt. 1) call omp_set_max_active_levels(3)
#endif
#ifdef MKL
istat = mkl_set_num_threads_local(int(omp_max_threads,4))
#endif
call dgemm('n', 't', dfnb*nvirt, nvirt, nvirt,
$ 1.d0, jab, dfnb*nvirt, ttl(1,1,j,k), nvirt,
$ 0.d0, Apbc, dfnb*nvirt)
if (j .ne. k) then ! j=k is taken care of inside wabc in the j=k case
c Ajk(P, b, c)=Ajk(P, c, b)
do b = 1, nvirt
do c = b+1, nvirt
tmpvsize(1:dfnb) = Apbc(1:dfnb, c, b)
Apbc(1:dfnb, c, b) = Apbc(1:dfnb, b, c)
Apbc(1:dfnb, b, c) = tmpvsize(1:dfnb)
end do
end do
c Ajk(P, b, c)=Ajk(P, b, c)+lljab(P, \bar b, \bar d)*ttl(\bar c, \tilde d, \bar k, \bar j)^T
call dgemm('n', 't', dfnb*nvirt, nvirt, nvirt,
$ 1.d0, jab, dfnb*nvirt, ttl2(1, 1, k, j), nvirt,
$ 1.d0, Apbc, dfnb*nvirt)
end if
#ifdef MKL
istat = mkl_set_num_threads_local(int(mkl_threads,4))
#endif
#if defined(OMP)
if(ptthreads .gt. 1) call omp_set_max_active_levels(2)
#endif
c$OMP END SINGLE
end if!}}}
if(ccsdmkl.eq.'seq' .and. rrd.eq.1) then
if(localcc .eq. 'off ') then
do i=1,nocc
call tr(nvirt, tt(1,1,i,j), rrp(:,:,i,1))
enddo
else
c$OMP DO SCHEDULE(DYNAMIC)
do i=1,nocc
call tr(nvirt, tt(1,1,i,j), rrp(:,:,i,1))
enddo
c$OMP END DO
end if
jinrr = 1
else
jinrr = j
end if
delta2 = 1.d0
fkj = fdo(k) + fdo(j)
if(j .eq. k) then !i=j=k => no contribution!
istart = j + 1
delta2 = delta2 + 1.d0
else
istart = j
endif
c$OMP DO
c$OMP& SCHEDULE(DYNAMIC)
cc$OMP& REDUCTION(+:et,ets)
do i=istart,nocc
if(ptrest) then
call read_rst_pt(k, j, i, nocc, kjidone, esum)
if(kjidone) then
c et = et + 2.d0 * esum
goto 123
end if
end if
c assemble <ab|ci> {{{
iinbl = .true.
if(i .ge. ind) then
call rpoint3d(abc_k(1:nvirt,1:nvirt,1:nvirt,i-ind+1),
$ abc_ofi, nvirt, nvirt, nvirt)
else if(i .eq. j) then ! .and. abciinmem)then
call rpoint3d(abc_ofj, abc_ofi, nvirt, nvirt, nvirt)
else if(localcc .eq. 'off ') then ! canonical small mem
TIME0(times0)
call rpoint3d(dcore(iabci +
$ omp_thread_num*nvirt**3), abc_ofi,
$ nvirt, nvirt, nvirt)
c if(ccsdalg .eq. 'disk ') then
c call abciread(i, 1, abc_ofi, bsymmv, symmocc,
c $ dgroup, co, first, last, v, irecln, ibufln,
c $ nbasis, nocc, nvirt, nirmax, multpg)
c
c else
c ! (ac|bi) = <ab|ci>
c call dgemm_wabc('t', 't', nvirt**2, nvirt, dfnb,
c $ 1.d0, jab, dfnb, jai(1, i, 1), nvirt*nocc,
c $ 0.d0, abc_ofi, nvirt**2, ccsdmkl, 1, nvirt)
c
c ! acbi -> abci
c do a = 1, nvirt
c do b = 1, nvirt
c scr1v(b+1:nvirt) = abc_ofi(a, b+1:nvirt, b)
c abc_ofi(a, b+1:nvirt, b) =
c $ abc_ofi(a, b, b+1:nvirt)
c abc_ofi(a, b, b+1:nvirt) = scr1v(b+1:nvirt)
c end do
c end do
c end if
c write(*,*) "assemble abci",i
call get_abci(i, abc_ofi, bsymmv, symmocc, dgroup,
$ co, first, last, v, irecln, ibufln, nbasis,
$ nocc, nvirt, nirmax, multpg, ccsdalg, dfnb,
$ jab, jai, dcore, imem, n, iabci +
$ omp_thread_num*nvirt**3,
$ ccsdmkl)
TIMEADD(times(1,12),times0)
else if(localcc .ne. 'off ') then
iinbl = .false.
call rpoint3d(abc_k(1:nvirt,1:nvirt,1:nvirt,1),
$ abc_ofi, nvirt, nvirt, nvirt)
end if ! }}}
delta3 = delta2
if(i .eq. j) delta3 = delta3 + 1.d0
deltaij = 1.d0
if(i.eq.j) deltaij = deltaij + 1
fkji = fkj + fdo(i)
c {{{ localcc.eq.'off ' case: FOR TEST PURPOSES, not efficient
if (talg.eq.'lapl'.or.talg.eq.'lato') then
c make ttl3(-,~,i,j) & ttl4(-,~,j,i) from ss(-,-,-,-)
do b=1,nvirt
fb=quad(nocc+b,iquad)**2
ttl3(1:nvirt,b)=tt(1:nvirt,b,i,j)/fb
IabIJ(1:nvirt,b)=tt(1:nvirt,b,j,i)/fb ! ttl4
enddo
else if(lapl) then
call dcopy(nvirt**2,tt(1,1,i,j),1,ttl3,1)
call dcopy(nvirt**2,tt(1,1,j,i),1,IabIJ,1) ! ttl4
end if!}}}
c compute W and V for all abc indices
TIME0(times0)
call wabc(nocc, nvirt, i, j, k, imem, dcore, t, ttl, rrp,
$ rrd, abc_ofi, abc_ofj, abc_ofk, kinrr, jinrr, w, v,
$ abij, aijk, times, talg, tt, iinbl, localcc,
$ IabIJ, jai, dfnb, ttl2, ttl3, ttl4, ttd, Apbc,
$ abijinmem, jai, ccsdmkl, ptthreads, omp_max_threads) !,
c $ abckinmem, abcjinmem, abciinmem)
TIMEADD(times(1,27),times0)
C calculating energy
TIME0(times0)
if(localcc .eq. 'off ') then !{{{
IF(i.eq.j.or.j.eq.k)THEN
esum=0.d0
C$OMP PARALLEL DO Schedule(Dynamic) Default(NONE)
C$OMP& SHARED(nvirt,fdv,fkji,w,v,talg)
C$OMP& PRIVATE(a,b,c,delta1,fcb,astart,delta,fcba,denom)
C$OMP& REDUCTION(+:esum) NUM_THREADS(nthreade)
do c=1,nvirt
do b=c,nvirt
delta1=1.d0
if(b.eq.c)delta1=delta1+1.d0
fcb=fdv(c)+fdv(b)
if(b.eq.c)then
astart=b+1
else
astart=b
endif
do a=astart,nvirt
delta=delta1
if(a.eq.b)delta=delta+1.d0
fcba=fcb+fdv(a)
if(talg.eq.'occ ') then
denom = (fkji-fcba)*delta
else
denom = -delta
end if
esum=esum+(
$-(v(a,b,c)+v(b,c,a)+v(c,a,b))*(w(a,b,c)+w(b,c,a)+w(c,a,b))
$+(w(a,b,c)*v(a,b,c)
$+w(b,c,a)*v(b,c,a)
$+w(c,a,b)*v(c,a,b))*3.d0
$)/denom
enddo
enddo
enddo
C$OMP END PARALLEL DO
fb = 4.d0*esum/delta3
ei(j) = ei(j) + fb
et=et+fb !4.d0*esum/delta3
if(lno.or.lnaf.or.lf12.and.trim(localcc).eq.'off')
$ ets = ets + fb*(qfact(i)+qfact(j)+qfact(k))/3
if(ccsdrest .ne. 'off') call saveToRst(nReqs, requests,
$ ijk_indices, energies, toSend, 2.d0*esum/delta3,
$ k, j, i, nocc, ccsd_communicator, nReqsAll,
$ ccsd_communicator, master_rank) !, mpi_type_e)
ELSE
c ! w1, w2, x, y1, z1 elore {{{
c call rpoint1d(dcore(imem+(n+2)*nvirt**3), w1,
c $ nvirt*(nvirt+1)/2-1)
c call rpoint1d(dcore(imem+(n+2)*nvirt**3+nvirt*(nvirt+1)/2-1),
c $ w2, nvirt*(nvirt+1)/2-1)
c call rpoint1d(dcore(imem+(n+2)*nvirt**3+
c $ 2*(nvirt*(nvirt+1)/2-1)), x, nvirt*(nvirt+1)/2-1)
c call rpoint1d(dcore(imem+(n+2)*nvirt**3+
c $ 3*(nvirt*(nvirt+1)/2-1)), y1, nvirt*(nvirt+1)/2-1)
c call rpoint1d(dcore(imem+(n+2)*nvirt**3+
c $ 4*(nvirt*(nvirt+1)/2-1)), z1, nvirt*(nvirt+1)/2-1)
c
c esum = 0.d0
c do c = 1, nvirt-1 ! a=b=c => no contribution
c TIME0(times(1,94))
ccc$OMP PARALLEL DO DEFAULT(PRIVATE)
ccc$OMP& SHARED(w1,w2,x,y1,z1,w,v,fdv,nvirt,c)
cc$OMP PARALLEL DO DEFAULT(SHARED)
cc$OMP& PRIVATE(b,fcb,astart,delta1,ind1,ind2,delta,fcba,abc)
cc$OMP& SCHEDULE(DYNAMIC)
c do b = c, nvirt
c fcb = fdv(c) + fdv(b)
c if(b .eq. c) then
c astart = b + 1
c delta1 = 2.d0
c else
c astart = b
c delta1 = 1.d0
c endif
c
c ind1 = (nvirt - c + 1) * (nvirt - c + 2) / 2 -
c $ (nvirt - b + 1) * (nvirt - b + 2) / 2
c if(b .eq. c) ind1 = ind1 + 1
c ind2 = ind1 + nvirt - astart
c
c w1(ind1:ind2) = w(astart:nvirt,b,c)
c $ - 2.0 * w(b,astart:nvirt,c)
c w2(ind1:ind2) = w(b,astart:nvirt,c)
c $ - 2.0 * w(astart:nvirt,b,c)
c x(ind1:ind2) = w(astart:nvirt,b,c) * v(astart:nvirt,b,c)
c $ + w(b,astart:nvirt,c) * v(b,astart:nvirt,c)
c y1(ind1:ind2) = v(astart:nvirt,b,c)
c z1(ind1:ind2) = v(b,astart:nvirt,c)
c
c w1(ind1:ind2) = w1(ind1:ind2) + w(c,astart:nvirt,b)
c $ - 2.0 * w(astart:nvirt,c,b)
c w2(ind1:ind2) = w2(ind1:ind2) + w(astart:nvirt,c,b)
c $ - 2.0 * w(c,astart:nvirt,b)
c x(ind1:ind2) = x(ind1:ind2) +
c $ w(c,astart:nvirt,b) * v(c,astart:nvirt,b)
c $ + w(astart:nvirt,c,b) * v(astart:nvirt,c,b)
c y1(ind1:ind2) = y1(ind1:ind2) + v(c,astart:nvirt,b)
c z1(ind1:ind2) = z1(ind1:ind2) + v(astart:nvirt,c,b)
c
c w1(ind1:ind2) = w1(ind1:ind2) + w(b,c,astart:nvirt)
c $ - 2.0 * w(c,b,astart:nvirt)
c w2(ind1:ind2) = w2(ind1:ind2) + w(c,b,astart:nvirt)
c $ - 2.0 * w(b,c,astart:nvirt)
c x(ind1:ind2) = x(ind1:ind2) +
c $ w(b,c,astart:nvirt) * v(b,c,astart:nvirt)
c $ + w(c,b,astart:nvirt) * v(c,b,astart:nvirt)
c y1(ind1:ind2) = y1(ind1:ind2) + v(b,c,astart:nvirt)
c z1(ind1:ind2) = z1(ind1:ind2) + v(c,b,astart:nvirt)
c
cc w1(ind1:ind2) = w(astart:nvirt,b,c)
cc w2(ind1:ind2) = - 2.0 * w(astart:nvirt,b,c)
cc y1(ind1:ind2) = v(astart:nvirt,b,c)
cc x(ind1:ind2) = w(astart:nvirt,b,c) * v(astart:nvirt,b,c)
cc
cc w1(ind1:ind2) = w1(ind1:ind2) - 2.0 * w(b,astart:nvirt,c)
cc w2(ind1:ind2) = w2(ind1:ind2) + w(b,astart:nvirt,c)
cc z1(ind1:ind2) = v(b,astart:nvirt,c)
cc x(ind1:ind2) = x(ind1:ind2) +
cc $ w(b,astart:nvirt,c) * v(b,astart:nvirt,c)
cc
cc
cc w1(ind1:ind2) = w1(ind1:ind2) + w(c,astart:nvirt,b)
cc w2(ind1:ind2) = w2(ind1:ind2) - 2.0 * w(c,astart:nvirt,b)
cc y1(ind1:ind2) = y1(ind1:ind2) + v(c,astart:nvirt,b)
cc x(ind1:ind2) = x(ind1:ind2) +
cc $ w(c,astart:nvirt,b) * v(c,astart:nvirt,b)
cc
cc w1(ind1:ind2) = w1(ind1:ind2) - 2.0 * w(astart:nvirt,c,b)
cc w2(ind1:ind2) = w2(ind1:ind2) + w(astart:nvirt,c,b)
cc z1(ind1:ind2) = z1(ind1:ind2) + v(astart:nvirt,c,b)
cc x(ind1:ind2) = x(ind1:ind2) +
cc $ w(astart:nvirt,c,b) * v(astart:nvirt,c,b)
cc
cc
cc w1(ind1:ind2) = w1(ind1:ind2) + w(b,c,astart:nvirt)
cc w2(ind1:ind2) = w2(ind1:ind2) - 2.0 * w(b,c,astart:nvirt)
cc y1(ind1:ind2) = y1(ind1:ind2) + v(b,c,astart:nvirt)
cc x(ind1:ind2) = x(ind1:ind2) +
cc $ w(b,c,astart:nvirt) * v(b,c,astart:nvirt)
cc
cc w1(ind1:ind2) = w1(ind1:ind2) - 2.0 * w(c,b,astart:nvirt)
cc w2(ind1:ind2) = w2(ind1:ind2) + w(c,b,astart:nvirt)
cc z1(ind1:ind2) = z1(ind1:ind2) + v(c,b,astart:nvirt)
cc x(ind1:ind2) = x(ind1:ind2) +
cc $ w(c,b,astart:nvirt) * v(c,b,astart:nvirt)
c
c
c ! abc = sum_k=v-b^v-c+1 (k) - 1
c abc = ind1 - 1
cc write(*,'(7i3)') c, b, ind1, ind2, astart, nvirt, abc
cc abc = (nvirt - c + 1) * (nvirt - c + 2) / 2 -
cc $ (nvirt - b) * (nvirt - b + 1) / 2 - 1
c do a = astart, nvirt
c if(a .eq. b) then
c delta = delta1 + 1.d0
c else
c delta = delta1
c end if
c fcba = fcb + fdv(a)
c abc = abc + 1
c x(abc) = x(abc) / (fkji - fcba) / delta
c y1(abc) = y1(abc) / (fkji - fcba) / delta
c z1(abc) = z1(abc) / (fkji - fcba) / delta
c enddo
c enddo
cc$OMP END PARALLEL DO
c TIMEADD(times(1,65),times(1,94))
cc write(*,*) 'c:',c
cc write(*,*) 'x', x
cc write(*,*) 'y', y1
cc write(*,*) 'z', z1
cc write(*,*) 'm:',m, abc
c
c m = (nvirt - c + 1) * (nvirt - c + 2) / 2 - 1
cc write(*,*) "m=",m
c
c db = 1024
c TIME0(times(1,94))
cc$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(l) REDUCTION(+:esum)
cc$OMP& SCHEDULE(DYNAMIC)
c do l = 1, ceiling(m/dble(db))
c db1 = min(m-(l-1)*db, db)
c esum=esum+ddot(db1,y1((l-1)*db+1),1,w1((l-1)*db+1),1)+
c $ ddot(db1,z1((l-1)*db+1),1,w2((l-1)*db+1),1)+
c $ 3.d0 * sum(x((l-1)*db+1:min(l*db,m)))
c end do
cc$OMP END PARALLEL DO
c TIMEADD(times(1,66),times(1,94))
c enddo !}}}
esum=0.d0
C$OMP PARALLEL DO Schedule(Dynamic) Default(NONE)
C$OMP& SHARED(nvirt,fdv,fkji,w,v,talg)
C$OMP& PRIVATE(a,b,c,astart,y,delta1,delta,fcb,fcba,z,denom)
C$OMP& REDUCTION(+:esum) NUM_THREADS(nthreade)
do c=1,nvirt
do b=c,nvirt
delta1=1.d0
if(b.eq.c)delta1=delta1+1.d0
fcb=fdv(c)+fdv(b)
if(b.eq.c)then
astart=b+1
else
astart=b
endif
do a=astart,nvirt
delta=delta1
if(a.eq.b)delta=delta+1.d0
fcba=fcb+fdv(a)
y=v(a,b,c)+v(b,c,a)+v(c,a,b)
z=v(a,c,b)+v(b,a,c)+v(c,b,a)
if(talg .eq. 'occ ') then
denom = (fkji-fcba)*delta
else
denom = -delta
end if
esum=esum+(
$+(w(a,b,c)+w(b,c,a)+w(c,a,b))*(y-2.d0*z)
$+(w(a,c,b)+w(b,a,c)+w(c,b,a))*(z-2.d0*y)
$+(w(a,b,c)*v(a,b,c)
$+w(b,c,a)*v(b,c,a)
$+w(c,a,b)*v(c,a,b)
$+w(a,c,b)*v(a,c,b)
$+w(b,a,c)*v(b,a,c)
$+w(c,b,a)*v(c,b,a))*3.d0
$)/denom
enddo
enddo
enddo
C$OMP END PARALLEL DO
fb = 2.d0*esum/delta3
ei(i) = ei(i) + fb
et=et+fb !2.d0*esum/delta3
if(lno.or.lnaf.or.lf12.and.trim(localcc).eq.'off')
$ ets = ets + fb*(qfact(i)+qfact(j)+qfact(k))/3
if(ccsdrest .ne. 'off') call saveToRst(nReqs, requests,
$ ijk_indices, energies, toSend, esum/delta3,
$ k, j, i, nocc, ccsd_communicator, nReqsAll,
$ ccsd_communicator, master_rank) !, mpi_type_e)
ENDIF ! }}}
else ! talg=occ local E expression {{{
if (localccXp(localcc,2022)) then
c if (.true.) then
esum=0.d0
esum2=0.d0
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(a,b,c,astart,y,delta1,delta,fcb,fcba,fact,w1,w2,z)
C$OMP& REDUCTION(+:esum,esum2) NUM_THREADS(nthreade)
do c=1,nvirt
do b=c,nvirt
delta1=1.d0
if(b.eq.c)delta1=delta1+1.d0
if(b.eq.c)then
astart=b+1
else
astart=b
endif
fcb=fdv(c)+fdv(b)
do a=astart,nvirt
delta=delta1
if(a.eq.b)delta=delta+1.d0
if (talg.eq.'topr'.or.talg.eq.'to ') then
fcba=fcb+fdv(a)
fact=fcba-fkji ! - due to other '- sign' in et=et-sum
else
fact = 1.d0
endif
w1=w(a,b,c)+w(b,c,a)+w(c,a,b)
w2=w(a,c,b)+w(c,b,a)+w(b,a,c)
y=v(a,b,c)+v(b,c,a)+v(c,a,b)
z=v(a,c,b)+v(b,a,c)+v(c,b,a)
esum=esum+(3.d0-delta)*(3.d0-deltaij)*(
$ w1*(y-2.d0*z)+w2*(z-2.d0*y)+3.d0*(
$ +w(a,b,c)*v(a,b,c)+w(b,c,a)*v(b,c,a)
$ +w(c,a,b)*v(c,a,b)+w(a,c,b)*v(a,c,b)
$ +w(b,a,c)*v(b,a,c)+w(c,b,a)*v(c,b,a))
$ )/(fact*6.d0)
c old expression
if (i.eq.j) then
y=v(a,b,c)+v(a,c,b)+v(b,c,a)
esum2=esum2+(
$ w(a,b,c)*(3.d0*v(a,b,c)-y)+
$ w(a,c,b)*(3.d0*v(a,c,b)-y)+
$ w(b,c,a)*(3.d0*v(b,c,a)-y)
$ )/(delta*fact)
else ! i>j
esum2=esum2+(
$ 2.d0/3.d0*
$ (w(a,b,c)+w(b,c,a)+w(c,a,b)-w(a,c,b)-w(b,a,c)-w(c,b,a))*
$ (v(a,b,c)+v(b,c,a)+v(c,a,b)-v(a,c,b)-v(b,a,c)-v(c,b,a))
c
$ + w(a,b,c)*(2.d0*v(a,b,c)-v(a,c,b)-v(c,b,a))
$ + w(b,c,a)*(2.d0*v(b,c,a)-v(a,c,b)-v(b,a,c))
$ + w(c,a,b)*(2.d0*v(c,a,b)-v(b,a,c)-v(c,b,a))
$ + w(a,c,b)*(2.d0*v(a,c,b)-v(b,c,a)-v(a,b,c))
$ + w(b,a,c)*(2.d0*v(b,a,c)-v(c,a,b)-v(b,c,a))
$ + w(c,b,a)*(2.d0*v(c,b,a)-v(a,b,c)-v(c,a,b))
c
$ )/(delta*fact)
endif ! i.eq.j or i>j
enddo
enddo
enddo
C$OMP END PARALLEL DO
et2=et2-esum2
else
c {{{ laplace lno-(t) energy expression < 2022
esum=0.d0
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(a,b,c,astart,y,delta1,delta,fcb,fcba,fact)
C$OMP& REDUCTION(+:esum) NUM_THREADS(nthreade)
do c=1,nvirt
do b=c,nvirt
delta1=1.d0
if(b.eq.c)delta1=delta1+1.d0
if(b.eq.c)then
astart=b+1
else
astart=b
endif
fcb=fdv(c)+fdv(b)
do a=astart,nvirt
delta=delta1
if(a.eq.b)delta=delta+1.d0
if (talg.eq.'topr'.or.talg.eq.'to ') then
fcba=fcb+fdv(a)
fact=fcba-fkji ! - due to other - in et=et-sum
else
fact = 1.d0
endif
c old expression
if (i.eq.j) then
y=v(a,b,c)+v(a,c,b)+v(b,c,a)
esum=esum+(
$ w(a,b,c)*(3.d0*v(a,b,c)-y)+
$ w(a,c,b)*(3.d0*v(a,c,b)-y)+
$ w(b,c,a)*(3.d0*v(b,c,a)-y)
$ )/(delta*fact)
else ! i>j
esum=esum+(
$ 2.d0/3.d0*
$ (w(a,b,c)+w(b,c,a)+w(c,a,b)-w(a,c,b)-w(b,a,c)-w(c,b,a))*
$ (v(a,b,c)+v(b,c,a)+v(c,a,b)-v(a,c,b)-v(b,a,c)-v(c,b,a))
c
$ + w(a,b,c)*(2.d0*v(a,b,c)-v(a,c,b)-v(c,b,a))
$ + w(b,c,a)*(2.d0*v(b,c,a)-v(a,c,b)-v(b,a,c))
$ + w(c,a,b)*(2.d0*v(c,a,b)-v(b,a,c)-v(c,b,a))
$ + w(a,c,b)*(2.d0*v(a,c,b)-v(b,c,a)-v(a,b,c))
$ + w(b,a,c)*(2.d0*v(b,a,c)-v(c,a,b)-v(b,c,a))
$ + w(c,b,a)*(2.d0*v(c,b,a)-v(a,b,c)-v(c,a,b))
c
$ )/(delta*fact)
endif ! i.eq.j or i>j
enddo
enddo
enddo
C$OMP END PARALLEL DO
endif
et=et-esum
epair(j,i)=epair(j,i)-esum
c }}}
end if ! }}}
123 continue
#ifdef MPI
if(mpi_size .gt. 1 .and. master_thread) then !mpi_rank .eq. 0) then
#ifdef OMP
if(omp_thread_num .eq. 0) then
#endif
do
call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
$ ccsd_communicator, flag, stat, mpi_err)
if(.not.flag) exit
if(stat(MPI_TAG) .eq. 0) then
call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC,
$ stat(MPI_SOURCE), stat(MPI_TAG),
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
prk = prk + comm_scr
else if(ccsdrest .ne. 'off ' .and.
$ stat(MPI_TAG) .eq. 1) then
c call MPI_Recv(esum, 1, MPI_DOUBLE_PRECISION,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator, MPI_STATUS_IGNORE,
c $ mpi_err)
c call MPI_Recv(energies, 1, mpi_type_e,
c $ stat(MPI_SOURCE), 1,
c $ ccsd_communicator, MPI_STATUS_IGNORE,
c $ mpi_err)
c write(*,*) 'call write_rst_pt1 comm ',ccsd_communicator
c call MPI_Recv(ijk_indices, 1, MPI_INTEGER_MRCC,
call MPI_Recv(ijk, 1, MPI_INTEGER_MRCC,
$ stat(MPI_SOURCE), 1,
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
c call MPI_Recv(energies, 1, MPI_DOUBLE_PRECISION,
call MPI_Recv(esum, 1, MPI_DOUBLE_PRECISION,
$ stat(MPI_SOURCE), 1,
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
c ijk = ijk_indices(1)
c esum = energies(1)
c write(*,*) 'call write_rst_pt1 with ',ijk, esum
c if(ijk.le.0) write(*,*) 'error, ', ijk, ' from ', stat(MPI_SOURCE)
call write_rst_pt(ijk, esum)
end if
end do
#ifdef OMP
end if
#endif
end if
#endif
TIMEADD(times(1,20),times0)
enddo ! i
c$OMP END DO NOWAIT
c$OMP SINGLE
prk = prk + nocc-istart+1
c$OMP END SINGLE NOWAIT
1234 continue
TIME0(times0)
c communicate progress {{{
#if defined(MPI)
if(.not.master_thread) then
if(omp_thread_num .eq. 0) then
flag = .true.
if(nummes .ne. 0) then
call MPI_Test(request, flag, stat, mpi_err)
end if
if(flag) then
comm_scr = 0
end if
comm_scr = comm_scr + prk - prk_sent
prk_sent = prk
if(flag) then
call MPI_Isend(comm_scr, 1, MPI_INTEGER_MRCC,
$ master_rank,0, ccsd_communicator, request, mpi_err)
nummes = nummes + 1
end if
end if
c else if(mpi_size .gt. 1) then ! first rank receives if ccsd is running MPI parallel
c flag = .true.
c do while(flag)
c call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
c $ ccsd_communicator, flag, stat, mpi_err)
c
c if(flag) then
c call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator, stat, mpi_err)
c prk = prk + comm_scr
c end if
c end do
end if
#endif
c }}}
TIMEADD(times(1,28),times0)
end do ! j
#ifdef MKL
istat = mkl_set_num_threads_local(int(mkl_max_threads,4))
#endif
c$OMP END PARALLEL
1235 continue
c communicate progress {{{
#if defined(MPI)
c if(mpi_rank .ne. 0) then
if(.not.master_thread) then
if(omp_thread_num .eq. 0) then
flag = .true.
if(nummes .ne. 0) then
call MPI_Test(request, flag, stat, mpi_err)
end if
if(flag) then
comm_scr = 0
end if
comm_scr = comm_scr + prk - prk_sent
prk_sent = prk
if(flag) then
call MPI_Isend(comm_scr, 1, MPI_INTEGER_MRCC,
$ master_rank,0, ccsd_communicator, request, mpi_err)
nummes = nummes + 1
end if
end if
c else if(mpi_size .gt. 1) then ! first rank receives if ccsd is running MPI parallel
c flag = .true.
c do while(flag)
c call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
c $ ccsd_communicator, flag, stat, mpi_err)
c
c if(flag) then
c call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator, stat, mpi_err)
c prk = prk + comm_scr
c end if
c end do
end if
#endif
c }}}
#ifdef MKL
call mkl_set_num_threads(mkl_max_threads)
!$OMP PARALLEL NUM_THREADS(ptthreads)
istat = mkl_set_num_threads_local(int(mkl_max_threads,4))
!$OMP END PARALLEL
#endif
#if defined(OMP)
call omp_set_max_active_levels(1)
#endif
TIMEADD(times(1,98),times(1,99))
if(WRITE_TIMES)
$ write(*, "('Rank ',i3,' done with k ',i3,
$ ' CPU [min]:',f10.5,2x,'Wall [min]:',7f10.5)")
$ mpi_rank,kk,times(1,98)/60.d0,times(2,98)/60.d0
end do ! k, kk is incremented
#ifdef MPI
if(nquad .ne. 1 .and. mpi_size .gt. 1)
$ call reset_counter(ccsd_communicator, count_k,
$ ccsd_communicator)
#endif
end do ! for q
TIME0(times)
TIME0(times(1,93))
#if defined(MPI)
if(ccsdrest .ne. "off ") nReqs = nReqs0
if(mpi_rank .ne. 0 .and. ccsdrest .ne. 'off ') then
call MPI_Waitall(nReqs0,requestsAll,MPI_STATUSES_IGNORE,mpi_err)
end if
if(mpi_size .gt. 1) then
call MPI_Ibarrier(ccsd_communicator, request, mpi_err)
if(mpi_size.gt.1 .and. WRITE_TIMES) then
write(*, "('Rank ',i3,' done with (T) ',
$ 'CPU [min]:',f10.3,2x,'Wall [min]:',7f10.3)")
$ mpi_rank,times(1,95)/60.d0,times(2,95)/60.d0
end if
c if(mpi_rank .eq. 0 .and. mpi_size .gt. 1) then
if(master_thread) then
call MPI_Ibarrier(ccsd_communicator, request, mpi_err)
ptdone = .false.
do while(.not.ptdone)
do !nm = 1, 10
call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG,
$ ccsd_communicator, flag, stat, mpi_err)
if(.not.flag) exit
if(stat(MPI_TAG) .eq. 0) then
! receive ijk's done and print progress counter
call MPI_Recv(comm_scr, 1, MPI_INTEGER_MRCC,
$ stat(MPI_SOURCE), stat(MPI_TAG),
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
prk = prk + comm_scr
pr = dble(prk) / dkji
if(pr .ge. prold+0.1d0 .and. pr.lt.1.d0) then
prold = pr
c write(iout, '(i4, "% done.")') nint(pr * 100)
endif
call progress_bar(iout, pr, .false.)
else if(stat(MPI_TAG) .eq. 1) then !ccsdrest .ne. 'off ') then
! receive and save energy contribution from tasks
c call MPI_Recv(esum, 1, MPI_DOUBLE_PRECISION,
c $ stat(MPI_SOURCE), stat(MPI_TAG),
c $ ccsd_communicator, MPI_STATUS_IGNORE,
c $ mpi_err)
c call MPI_Recv(energies, 1, mpi_type_e,
c $ stat(MPI_SOURCE), 1,
c $ ccsd_communicator, MPI_STATUS_IGNORE,
c $ mpi_err)
c write(*,*) 'call write_rst_pt2 comm ',ccsd_communicator
call MPI_Recv(ijk, 1, MPI_INTEGER_MRCC,
$ stat(MPI_SOURCE), 1,
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
call MPI_Recv(esum, 1, MPI_DOUBLE_PRECISION,
$ stat(MPI_SOURCE), 1,
$ ccsd_communicator, MPI_STATUS_IGNORE,
$ mpi_err)
c ijk = ijk_indices(1)
c esum = energies(1)
c call write_rst_pt(stat(MPI_TAG), esum)
c write(*,*) 'call write_rst_pt2 with ',ijk, esum
call write_rst_pt(ijk, esum)
else
write(iout, '(a,i6)')
$ ' ERROR: Master recevived tag ',
$ stat(MPI_TAG)
call mrccend(1)
end if
end do
! other tasks done with (T)?
call MPI_Test(request, ptdone, MPI_STATUS_IGNORE,
$ mpi_err)
end do
else
if(.not.master_thread .and. ccsdrest .ne. 'off ') then
c call MPI_Waitall(nReqs0,requestsAll,MPI_STATUSES_IGNORE,
call MPI_Waitall(nReqs,requests,MPI_STATUSES_IGNORE,
$ mpi_err)
end if
call MPI_Ibarrier(ccsd_communicator, request, mpi_err)
call MPI_Wait(request, MPI_STATUS_IGNORE, mpi_err)
c call MPI_Barrier(ccsd_communicator, mpi_err)
end if
c call free_manager(ccsd_communicator, counter_k)
call deallocate_counter(ccsd_communicator, count_k)
call free_manager(ccsd_communicator)
call MPI_Allreduce(MPI_IN_PLACE, et, 1, MPI_DOUBLE_PRECISION,
$ MPI_SUM, ccsd_communicator, mpi_err)
call MPI_Allreduce(MPI_IN_PLACE, ets, 1, MPI_DOUBLE_PRECISION,
$ MPI_SUM, ccsd_communicator, mpi_err)
end if
#endif
TIMEADD(times(1,30),times)
TIMEADD(times(1,28),times)
c write(iout,'(" 100% done.")')
call progress_bar(iout, 1.d0, .false.)
if (localccXp(localcc,2022).and.master_thread)
$write(iout,'(a12,2es17.8)') 'old/new (T)',et2,et
if(ccsdrest .ne. 'off') close(725)
c write(iout,*)
c write(iout,'(" CCSD(T) correlation energy [au]:",f22.12)')et
c write(iout,'(" CCSD(T) total energy [au]: ",f22.12)')et+ecc
c write(iout,*)
c call prtenergc('CCSD(T) ',et+ecc,eref,locnoc)
return
end subroutine!}}}
c
C {{{ trf2laplbasis
************************************************************************
subroutine trf2laplbasis(tt,ttl,jij,jab,jia,nocc,nvirt,dfnb,
$abij,abci,aijk,quad,iquad,nquad,cmo,ljapi,lljpai,lljab,
$lajik,laibj,labc,lljij,ljij,lcmo,ilcmo,t,t1,talg,fdo,ofock,ofock1,
$abci_inmem,ttl2,ttijb,ttbij,qm1q)
************************************************************************
c multiply with Laplace factors and trf to the Mayer basis: T2 ,abci, abij and aijk
************************************************************************
implicit none
integer i,j,a,b,nocc,nvirt,dfnb,iquad,c,k,nquad,imax,jlen
real*8 jij(dfnb,nocc,nocc)
real*8 jia(nocc,nvirt,dfnb),cmo(nocc,nocc)
real*8 lcmo(nocc,nocc),ilcmo(nocc,nocc)
real*8 jab(dfnb,nvirt,nvirt),quad(nvirt+nocc,nquad)
real*8 ttl(nvirt,nvirt,nocc,1)
real*8 ttl2(nvirt,nvirt,1,nocc)
real*8 tt(nvirt,nvirt,nocc,nocc),abci(nvirt,nvirt,nvirt,*)
real*8 abij(nvirt,nvirt,nocc,*),aijk(nvirt,nocc,nocc,nocc)
real*8 fa,fb,fb2,t1(nvirt,nocc),t(nvirt,nocc),s
real*8 ljapi(nvirt,dfnb,nocc),qm1q(nocc,nocc)
real*8 lljab(dfnb,nvirt,nvirt),lajik(nvirt,nocc,nocc)
real*8 ttijb(nocc,nocc,nvirt),ttbij(nvirt,nocc,nocc)
real*8 laibj(nvirt,nocc,nvirt,*),labc(nvirt,nvirt,nvirt)
real*8 lljij(nocc,dfnb,nocc),ljij(nocc,dfnb,nocc)
real*8 lljpai(dfnb,nvirt,nocc),fdo(nocc),ofock(nocc,nocc)
real*8 ofock1(nocc,nocc)
character*4 talg
logical lapl,abci_inmem
c
lapl=talg.eq.'lapl'.or.talg.eq.'lato'
c 0) Laplace the canonical index of the occ mo trf mx:
if (lapl) then
do i=1,nocc
lcmo(i,1:nocc)=cmo(i,1:nocc)*quad(i,iquad) ! for barred, in v
enddo
do i=1,nocc
ilcmo(i,1:nocc)=cmo(i,1:nocc)/quad(i,iquad) ! for tilded, in t
enddo
else ! sc or T0
lcmo=cmo
ilcmo=cmo
endif
c 1) Laplace and trf Jij: pij->\tilde j p i -> \tilde j p \bar i
call dgemm('t','t',nocc,dfnb*nocc,nocc,1.d0,ilcmo,nocc,jij,
$dfnb*nocc,0.d0,ljij,nocc) ! in aijk
call dgemm('n','n',nocc*dfnb,nocc,nocc,1.d0,ljij,dfnb*nocc,
$lcmo,nocc,0.d0,lljij,nocc*dfnb) ! in w
c 2) Laplace and trf Jai: iap -> ap\bar i -> p\bar a \bar i
call dgemm('t','n',nvirt*dfnb,nocc,nocc,1.d0,jia,nocc,lcmo,nocc,
$0.d0,ljapi,nvirt*dfnb) ! in aijk
if (lapl) then
do i=1,nocc
do a=1,nvirt
lljpai(1:dfnb,a,i)=ljapi(a,1:dfnb,i)*quad(nocc+a,iquad)
enddo
enddo
else ! sc or T0
do i=1,nocc
call gtrans(ljapi(1,1,i),lljpai(1,1,i),nvirt,dfnb)
enddo
endif
c 3) assembly (aj|ik) and store <ai|jk>
do k=1,nocc
call dgemm('t','t',nvirt*nocc,nocc,dfnb,1.d0,lljpai,
$dfnb,lljij(1,1,k),nocc,0.d0,lajik,nvirt*nocc) ! in rr
do j=1,nocc
do i=1,nocc
aijk(1:nvirt,j,i,k)=lajik(1:nvirt,i,j)
enddo
enddo
enddo
c 4) assembly (ai|bj) and store <ab|ij> for j=1
jlen=1 ! localcc=off case is handled outside
call dgemm('t','n',nvirt*nocc,nvirt*jlen,dfnb,1.d0,lljpai,dfnb,
$lljpai,dfnb,0.d0,laibj,nvirt*nocc) ! in rr
do j=1,jlen
do b=1,nvirt
do i=1,nocc
abij(1:nvirt,b,i,j)=laibj(1:nvirt,i,b,j)
enddo
enddo
enddo
c 5) Laplace Jab -> p \bar a \bar b
if (lapl) then
do a=1,nvirt
fa=quad(nocc+a,iquad)
if (iquad.gt.1) fa=fa/quad(nocc+a,iquad-1) ! jab is owerwritten with lljab of the previous Laplace quadrature point
do b=1,nvirt
fb=quad(nocc+b,iquad)
if (iquad.gt.1) fb=fb/quad(nocc+b,iquad-1)! jab is owerwritten with lljab of the previous Laplace quadrature point
lljab(1:dfnb,b,a)=jab(1:dfnb,b,a)*fa*fb
enddo
enddo
else
lljab=jab
endif
c 6) assembly (ac|bi) for a single i and store <ab|ci> only IF entire <ab|ci> fits into memory
c ELSE save <ab|c1>
imax=nocc
if (.not.abci_inmem) imax=1
do i=1,imax
call dgemm('t','n',nvirt**2,nvirt,dfnb,1.d0,lljab,dfnb,
$lljpai(1,1,i),dfnb,0.d0,labc,nvirt**2) ! labc in w
do b=1,nvirt
do c=1,nvirt
abci(1:nvirt,b,c,i)=labc(1:nvirt,c,b)
enddo
enddo
enddo
c for 7) qm1q = cmo^-1 / quad(iquad-1) * lcmo: iquad-1 GSL basis to iquad GSL basis trf mx
if (iquad.gt.1) then
do i=1,nocc
do j=1,nocc
s=0.d0
do k=1,nocc
s=s+cmo(k,i)*lcmo(k,j)/quad(k,iquad-1)
enddo
qm1q(i,j)=s ! in IabIJ
enddo
enddo
elseif (iquad.eq.1) then
qm1q=lcmo ! for both lapl and sc/T0
endif
c 7) Laplace and trf T2
do a=1,nvirt
if (lapl) then
fa=quad(nocc+a,iquad)
if (iquad.gt.1) fa=fa/quad(nocc+a,iquad-1) ! tt is owerwritten with the previos Laplace quadrature point
else
fa=1.d0
endif
c a) tt(a,b,i,j)->tt_ijb(i,j,b) of \bar a
do j=1,nocc
do i=1,nocc
do b=1,nvirt
ttijb(i,j,b)=tt(a,b,i,j)
enddo
enddo
enddo
c b) tt_ijb(i,j,b) -> fa*tt_jbi(j,b,\bar i)
call dgemm('t','n',nvirt*nocc,nocc,nocc,fa,ttijb,nocc,
$qm1q,nocc,0.d0,lajik,nvirt*nocc) ! in rr
c c) tt_jbi(j,b,\bar i) -> tt_bij(b,\bar i,\bar j)
call dgemm('t','n',nvirt*nocc,nocc,nocc,1.d0,lajik,nocc,
$qm1q,nocc,0.d0,ttbij,nocc*nvirt) ! in w
c d) tt_bij(b,\bar i,\bar j) -> ss(\bar a,\bar b,\bar i,\bar j) & ttl(\bar a,\tilde b,\bar i,1) & ttl2(\bar a,\tilde b,1,\bar i)
do b=1,nvirt
if (lapl) then
fb=quad(nocc+b,iquad)
if (iquad.gt.1) fb=fb/quad(nocc+b,iquad-1) ! tt is owerwritten with the previous Laplace quadrature point
fb2=quad(nocc+b,iquad)**2
else
fb=1.d0
fb2=1.d0
endif
tt(a,b,1:nocc,1:nocc)=ttbij(b,1:nocc,1:nocc)*fb ! this will be ss, same mem space
ttl(a,b,1:nocc,1)=tt(a,b,1:nocc,1)/fb2
ttl2(a,b,1,1:nocc)=tt(a,b,1,1:nocc)/fb2
enddo !b
enddo !a
c 8) Laplace and trf T1
call dgemm('n','n',nvirt,nocc,nocc,1.d0,t1,nvirt,lcmo,nocc,
$0.d0,t,nvirt)
if (lapl) then
do a=1,nvirt
t(a,1:nocc)=t(a,1:nocc)*quad(nocc+a,iquad)
enddo
endif
c 9) compute semi-canonical orbital energies
c write(*,*) 'warning mk Fock in Mayer basis in lapl too'
if (talg.eq.'topr'.or.talg.eq.'to ') then
call dcopy(nocc**2,cmo,1,ofock1,1)
do i=1,nocc
c call dscal(nocc,fdo(i),ofock1(1,i),1)
call dscal(nocc,fdo(i),ofock1(i,1),nocc) ! in w
enddo
call dgemm('t','n',nocc,nocc,nocc,1.d0,cmo,nocc,ofock1,nocc,
$0.d0,ofock,nocc)
do i=1,nocc
fdo(i)=ofock(i,i) ! in v
enddo
endif
c
return
end subroutine trf2laplbasis
c }}}
C {{{ wabc
************************************************************************
subroutine wabc(nocc,nvirt,i,j,k,imem,dcore,t,tt,rr,rrd,
$abc_ofi, abc_ofj, abc_ofk, kinrr,
$jinrr,w,v,abij,aijk,times,talg,ss,iinbl,localcc,IabIJ,lljpai,dfnb,
$tt2,tt3,tt4,ttd,Apbc,abijinmem,jai,ccsdmkl,ptthreads,
$omp_max_threads) !,
c $abckinmem, abcjinmem, abciinmem,n,bcymv,symocc,dgroup,co,
c $first,last,va,irecln,ibufln,nbasis,nirmax,multpg,ccsdalg,jab,jai)
************************************************************************
c called if talg=occ or talg=lapl
c build W and V matrices of the (T) for all a,b,c and a given i,j(,k) index set
c if talg=occ ss=tt=tt2=tt3; ttd=nocc
c if talg=lapl ss.ne.tt due to different Laplace factors & ttd=1
c tt(-,~,-,k) in tt, tt(-,~,k,-) in tt2, tt(-,~,i,j) in tt3, tt(-,~,j,i) in tt4
************************************************************************
implicit none
character*8 ccsdalg
integer a,b,c,i,j,k,rrd,imem,kinrr,nocc,nvirt,l,dfnb
integer ttd,ti,tj,tk,intk, jinrr
real*8 abc_ofi(nvirt, nvirt, nvirt)
real*8 abc_ofj(nvirt, nvirt, nvirt)
real*8 abc_ofk(nvirt, nvirt, nvirt)
real*8 t(nvirt,nocc),tt(nvirt,nvirt,nocc,ttd)
real*8 rr(nvirt,nvirt,nocc,rrd),ss(nvirt,nvirt,nocc,nocc)
real*8 tt2(nvirt,nvirt,ttd,nocc)
real*8 tt4(nvirt,nvirt,ttd,ttd),tt3(nvirt,nvirt,ttd,ttd)
real*8 w(nvirt,nvirt,nvirt),v(nvirt,nvirt,nvirt)
real*8 abij(nvirt,nvirt,nocc,*) !<12|12>, only for j=1 if laplace + localcc
real*8 aijk(nvirt,nocc,nocc,nocc) !<12|12>
real*8 IabIJ(nvirt,nvirt),lljpai(dfnb,nvirt,nocc)
real*8 times(6,0:100),dcore(*),jai(nvirt,nocc,dfnb)
real*8 Apbc(dfnb, nvirt, nvirt)
character*4 talg,localcc
logical lapl,iinbl,abijinmem
real*8 timep(6, 3)
real*8 times0(6, 2)
character*3 ccsdmkl
integer ptthreads, omp_max_threads
integer nthread
integer omp_get_thread_num
c logical abckinmem, abcjinmem, abciinmem
real*8, pointer :: rr_i(:,:,:)
interface
c {{{ interfaces for pointers
subroutine rpoint3d(egydim,haromdim,dim1,dim2,dim3)
implicit none
integer dim1,dim2,dim3
real*8,target :: egydim(dim1,dim2,dim3)
real*8, pointer :: haromdim(:,:,:)
end subroutine
c}}}
subroutine get_abci(i, abc_ofi, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis, nocc, nvirt,
$ nirmax, multpg, ccsdalg, dfnb, jab, jai, dcore, imem, n,
$ addr, ccsdmkl)
implicit none
integer :: i, nvirt, nocc, nbasis, dfnb, imem, n, addr
integer :: irecln, ibufln, nirmax, dgroup
integer :: bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
real*8 :: va(nvirt,nvirt,nvirt,*)
real*8 :: dcore(*)
integer :: co(nbasis)
integer :: first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
real*8, pointer :: abc_ofi(:, :, :)
real*8 :: jai(nvirt, nocc, dfnb), jab(dfnb, nvirt, nvirt)
character*8 :: ccsdalg
character*3 :: ccsdmkl
end subroutine
end interface
#if defined(OMP)
c if(ccsdmkl .eq. 'thr') then
c nthread = 1
c else
nthread = omp_max_threads / ptthreads
if(omp_get_thread_num()+1 .le. mod(omp_max_threads, ptthreads))
$ nthread = nthread + 1
c end if
#endif
lapl=.false.
if (talg.eq.'lapl'.or.talg.eq.'lato') lapl=.true.
if (lapl) then
ti=1 ! position in tt3/tt4
tj=1 ! position in tt3/tt4
tk=1 ! position in tt/tt2
else
ti=i
tj=j
tk=k
endif
intk=k ! localcc=off.and.(talg=occ/lapl)
if (.not.abijinmem) intk=1 ! position of k index in abij if abij is not stored in memory
C III a)
IF(i.eq.j)THEN
c {{{ (i.eq.j)
TIME0(times0(1,2))
c collect to v(b,a,c), then w(a,b,c)=v(b,a,c)+v(a,b,c)
c f) and b) (b here) I_ba,di*T_cd,kj^T
TIME0(times0)
call dgemm_wabc('n', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofj, nvirt**2, tt2(1, 1, tk, j), nvirt,
$ 0.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,37),times0)
c c) and e) (c here ! I symmetry) T_bd,jk*I_da,ci
TIME0(times0)
call dgemm_wabc('n', 'n', nvirt, nvirt**2, nvirt,
$ 1.d0, tt(1, 1, j, tk), nvirt, abc_ofj, nvirt,
$ 1.d0, v, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,38),times0)
cc a) and d) (a here) T_bd,ji*I_ac,dk
TIME0(times0)
call dgemm_wabc('n', 't', nvirt, nvirt**2, nvirt,
$ 1.d0, tt4(1, 1, tj, ti), nvirt, abc_ofk, nvirt**2,
$ 1.d0, v, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,39),times0)
TIMEADD(times(1,13),times0(1,2))
C 2. tag
TIME0(times0(1,2))
c b) and d) (b here) I_bl,jk*R_ac,li
if (rrd.eq.1) then
TIME0(times0)
do l=1,nocc
call tr(nvirt,ss(1,1,l,i),w(1,1,l)) ! transpose a and b of tt(a,b,l)_i to w
enddo
call dgemm_wabc('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1, 1, j, k), nvirt, w, nvirt**2,
$ 1.d0, v, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,57),times0)
elseif (rrd.eq.nocc) then
TIME0(times0)
call dgemm_wabc('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1, 1, j, k), nvirt, rr(1, 1, 1, i), nvirt**2,
$ 1.d0, v, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,57),times0)
endif
c a) and f) (a here) T_ba,li*I_cl,kj
TIME0(times0)
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1, 1, 1, i), nvirt**2, aijk(1, 1, k, j), nvirt,
$ 1.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,59),times0)
c c) and e) (c here) I_bl,jk*T_ac,lk
if(ccsdmkl .eq. 'thr') then
TIME0(times0)
call dgemm('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1, 1, j, i), nvirt, ss(1, 1, 1, k), nvirt**2,
$ 1.d0, v, nvirt)
TIMEADD(times(1,58),times0)
TIME0(times0)
call dcopy(nvirt**3,v,1,w,1)
do c=1,nvirt
call tradd(nvirt,1.d0,v(1,1,c),w(1,1,c))
enddo
TIMEADD(times(1,63),times0)
else
TIME0(times0)
c$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(b, c, timep) SCHEDULE(DYNAMIC)
c$OMP& NUM_THREADS(nthread)
do c = 1, nvirt
c TIME0(timep(1,3))
call dgemm('n', 't', nvirt, nvirt, nocc,
$ -1.d0, aijk(1,1,j,i), nvirt, ss(1,c,1,k), nvirt**2,
$ 1.d0, v(1, 1, c), nvirt)
! v(:,:,c) should be in cache
do b = 1, nvirt
w(1:nvirt, b, c) = v(1:nvirt, b, c) + v(b, 1:nvirt, c)
end do
end do
c$OMP END PARALLEL DO
TIMEADD(times(1,58),times0)
end if
TIMEADD(times(1,14),times0(1,2))
c }}}
ELSE IF(j.eq.k)THEN !i.ne.j!!!
c {{{ (j.eq.k)
TIME0(times0(1,2))
c collect to v(a,c,b), then w(a,b,c)=v(a,c,b)+v(a,b,c)
c b) and c) (c here, I symmetry) I_da,ci^T * T_bd,jk^T
TIME0(times0)
if (iinbl .or. localcc.eq.'off ') then
call dgemm_wabc('t', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofi, nvirt, tt(1, 1, j, tk), nvirt,
$ 0.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
else
call dgemm_wabc('n', 'n', nvirt, nvirt**2, dfnb,
$ 1.d0, jai(1, i, 1), nvirt*nocc, Apbc, dfnb,
$ 0.d0, v, nvirt, ccsdmkl, 3, nvirt) ! v(a, c, b)= lljpai(P, \bar a, \bar I) Ajk(P, c, b)
end if
TIMEADD(times(1,40),times0)
c a) and f) (a here) I_ac,dk * T_bd,ji^T
TIME0(times0)
call dgemm_wabc('n', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofj, nvirt**2, tt4(1, 1, tj, ti), nvirt,
$ 1.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,41),times0)
c e) and d) (e here) T_ad,ik * I_cb,dj^T
TIME0(times0)
call dgemm_wabc('n', 't', nvirt, nvirt**2, nvirt,
$ 1.d0, tt(1, 1, i, tk), nvirt, abc_ofj, nvirt**2,
$ 1.d0, v, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,42),times0)
TIMEADD(times(1,15),times0(1,2))
C 2. tag
TIME0(times0(1,2))
c a) and b), b: rr(a,c,l,i)*<bl|jk>^T -> v(a,c,b) !NP
if (rrd.eq.1) then
TIME0(times0)
do l=1,nocc
call tr(nvirt,ss(1,1,l,i),w(1,1,l)) ! transpose a and b of tt(a,b,l)_i to w
enddo
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, w, nvirt**2, aijk(1, 1, j, k), nvirt,
$ 1.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,60),times0)
elseif (rrd.eq.nocc) then
TIME0(times0)
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, rr(1, 1, 1, i), nvirt**2, aijk(1, 1, j, k), nvirt,
$ 1.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,60),times0)
endif
c c) and f) (c here) T_ac,lk * I_bl,ji^T
TIME0(times0)
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1, 1, 1, k), nvirt**2, aijk(1, 1, j, i), nvirt,
$ 1.d0, v, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,61),times0)
c d) and e) (d here) I_al,ik * T_cb,lj^T
if(ccsdmkl .eq. 'thr') then
TIME0(times0)
call dgemm('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1,1,i,k), nvirt, ss(1,1,1,j), nvirt**2,
$ 1.d0, v, nvirt)
TIMEADD(times(1,62),times0)
c
TIME0(times0)
call dcopy(nvirt**3,v,1,w,1)
do c=1,nvirt
do b=1,nvirt
call daxpy(nvirt,1.d0,v(b,c,1),nvirt**2,w(b,1,c),nvirt)
enddo
enddo
TIMEADD(times(1,64),times0)
else
TIME0(times0)
call dfillzero(w, nvirt**3)
c$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(a, b, c, timep)
c$OMP& SCHEDULE(DYNAMIC)
c$OMP& NUM_THREADS(nthread)
do b = 1, nvirt
call dgemm('n', 't', nvirt, nvirt, nocc,
$ -1.d0, aijk(1,1,i,k), nvirt, ss(1,b,1,j), nvirt**2,
$ 1.d0, v(1, 1, b), nvirt)
c$OMP CRITICAL
call daxpy(nvirt**2, 1.d0, v(1, 1, b), 1, w(1, 1, b), 1)
do c = 1, nvirt
call daxpy(nvirt, 1.d0, v(1, c, b), 1, w(1, b, c), 1)
end do
c$OMP END CRITICAL
end do
c$OMP END PARALLEL DO
TIMEADD(times(1,62),times0)
end if
TIMEADD(times(1,16),times0(1,2))
c }}}
ELSE
c {{{ 6 <ab|ci>*T2 terms
TIME0(times0)
c e)
TIME0(times0(1,2))
call dgemm_wabc('n', 'n', nvirt, nvirt**2, nvirt,
$ 1.d0, tt(1, 1, i, tk), nvirt, abc_ofj, nvirt,
$ 0.d0, w, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,47),times0(1,2))
c f)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofj, nvirt**2, tt2(1, 1, tk, i), nvirt,
$ 1.d0, w, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,48),times0(1,2))
c d)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt, nvirt**2, nvirt,
$ 1.d0, tt3(1, 1, ti, tj), nvirt, abc_ofk, nvirt**2,
$ 1.d0, w, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,46),times0(1,2))
c a)
TIME0(times0(1,2))
if(ccsdmkl .eq. 'thr') then
call dgemm('t', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofk, nvirt, tt4(1, 1, tj, ti), nvirt,
$ 0.d0, v, nvirt**2) ! start v(c, a, b)
else
call dgemm_wabc('n', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofk, nvirt**2, tt4(1,1,tj,ti), nvirt,
$ 1.d0, w, nvirt, ccsdmkl, 2, nvirt)
end if
TIMEADD(times(1,43),times0(1,2))
c b)
if (iinbl .or. localcc.eq.'off ') then
TIME0(times0(1,2))
if(ccsdmkl .eq. 'thr') then
call dgemm('n', 'n', nvirt, nvirt**2, nvirt,
$ 1.d0, tt2(1, 1, tk, j), nvirt, abc_ofi, nvirt,
$ 1.d0, v, nvirt)
else
call dgemm_wabc('t', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofi, nvirt, tt2(1, 1, tk, j), nvirt,
$ 1.d0, w, nvirt**2, ccsdmkl, 1, nvirt)
end if
TIMEADD(times(1,44),times0(1,2))
c c)
TIME0(times0(1,2))
if(ccsdmkl .eq. 'thr') then
call dgemm('n', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofi, nvirt**2, tt(1, 1, j, tk), nvirt,
$ 1.d0, v, nvirt**2)
else
call dgemm_wabc('t', 't', nvirt**2, nvirt, nvirt,
$ 1.d0, abc_ofi, nvirt, tt(1, 1, j, tk), nvirt,
$ 1.d0, w, nvirt, ccsdmkl, 2, nvirt)
end if
TIMEADD(times(1,45),times0(1,2))
else
call dgemm_wabc('n', 'n', nvirt, nvirt**2, dfnb,
$ 1.d0, jai(1, i, 1), nvirt*nocc, Apbc, dfnb,
$ 1.d0, w, nvirt, ccsdmkl, 3, nvirt)
endif ! iinbl
TIMEADD(times(1,17),times0)
c }}}
c
c {{{ 6 <ai|jk>*T2 terms
TIME0(times0)
if(ccsdmkl .eq. 'thr') then
c b)
TIME0(times0(1,2))
call dgemm('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1, 1, 1, i), nvirt**2, aijk(1, 1, j, k), nvirt,
$ 1.d0, v, nvirt**2)
TIMEADD(times(1,51),times0(1,2))
c c)
TIME0(times0(1,2))
call dgemm('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, rr(1,1,1,kinrr), nvirt**2, aijk(1,1,j,i), nvirt,
$ 1.d0, v, nvirt**2)
TIMEADD(times(1,52),times0(1,2))
c v(c,a,b) +> w(a,b,c)
TIME0(times0(1,2))
do c=1,nvirt
call daxpy(nvirt**2,1.d0,v(c,1,1),nvirt,w(1,1,c),1)
enddo
TIMEADD(times(1,49),times0(1,2))
else
c c)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1,1,1,k), nvirt**2, aijk(1,1,j,i), nvirt,
$ 1.d0, w, nvirt, ccsdmkl, 2, nvirt)
TIMEADD(times(1,52),times0(1,2))
end if
if (rrd.eq.1 .and. ccsdmkl.eq.'thr') then
c a)
TIME0(times0(1,2))
call dgemm('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1, 1, k, j), nvirt, ss(1, 1, 1, i), nvirt**2,
$ 0.d0, v, nvirt) ! start v(c, b, a)
TIMEADD(times(1,50),times0(1,2))
c d)
TIME0(times0(1,2))
call dgemm('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1, 1, 1, j), nvirt**2, aijk(1, 1, i, k), nvirt,
$ 1.d0, v, nvirt**2)
TIMEADD(times(1,53),times0(1,2))
c v(c,b,a) +> w(a,b,c)
TIME0(times0(1,2))
do c=1,nvirt
do b=1,nvirt
call daxpy(nvirt,1.d0,v(c,b,1),nvirt**2,w(1,b,c),1)
enddo
enddo
TIMEADD(times(1,47),times0(1,2))
else
if(rrd .eq. 1) then
do l=1,nocc
call tr(nvirt,ss(1,1,l,i),v(1,1,l)) ! transpose a and b of tt(a,b,l)_i to v
enddo
call rpoint3d(v, rr_i, nvirt, nvirt, nocc)
else
call rpoint3d(rr(1, 1, 1, i), rr_i, nvirt, nvirt, nocc)
end if
c a)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, rr_i, nvirt**2, aijk(1, 1, k, j), nvirt,
$ 1.d0, w, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,50),times0(1,2))
c d)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1, 1, i, k), nvirt,
$ rr(1, 1, 1, jinrr), nvirt**2,
$ 1.d0, w, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,53),times0(1,2))
c b)
TIME0(times0(1,2))
if(ccsdmkl.eq.'seq') then
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, rr_i, nvirt**2, aijk(1, 1, j, k), nvirt,
$ 1.d0, w, nvirt, ccsdmkl, 2, nvirt)
end if
TIMEADD(times(1,51),times0(1,2))
endif
c f)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt**2, nvirt, nocc,
$ -1.d0, ss(1,1,1,j), nvirt**2, aijk(1,1,k,i), nvirt,
$ 1.d0, w, nvirt**2, ccsdmkl, 1, nvirt)
TIMEADD(times(1,55),times0(1,2))
c e)
TIME0(times0(1,2))
call dgemm_wabc('n', 't', nvirt, nvirt**2, nocc,
$ -1.d0, aijk(1,1,i,j), nvirt, ss(1,1,1,k), nvirt**2,
$ 1.d0, w, nvirt, ccsdmkl, 3, nvirt)
TIMEADD(times(1,54),times0(1,2))
TIMEADD(times(1,18),times0)
c }}}
ENDIF
C calculating V {{{
TIME0(times0(1,2))
call dcopy(nvirt**3,w,1,v,1)
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(c)
c$OMP& NUM_THREADS(nthread)
do c = 1, nvirt
call dger(nvirt, nvirt,
$ 1.d0, t(1,i), 1, abij(1,c,j,intk), 1, v(1,1,c), nvirt)
end do
C$OMP END PARALLEL DO
c
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(c)
c$OMP& NUM_THREADS(nthread)
do c = 1, nvirt
call dger(nvirt, nvirt,
$ 1.d0, abij(1,c,i,intk), 1, t(1,j), 1, v(1,1,c), nvirt)
end do
C$OMP END PARALLEL DO
if (.not.abijinmem.and.j.ne.1) then
call dgemm('n', 't', nvirt, nvirt, dfnb,
$ 1.d0, jai(1,i,1), nvirt*nocc, jai(1,j,1), nvirt*nocc,
$ 0.d0, IabIJ, nvirt) ! assembly the missing abij(.,.,i,j)
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(b)
c$OMP& NUM_THREADS(nthread)
do b = 1, nvirt
call dger(nvirt, nvirt,
$ 1.d0, Iabij(1,b), 1, t(1,k), 1, v(1,b,1), nvirt**2)
end do
C$OMP END PARALLEL DO
else ! abijinmem.or.j.eq.1
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(b)
c$OMP& NUM_THREADS(nthread)
do b = 1, nvirt
call dger(nvirt, nvirt,
$ 1.d0, abij(1,b,i,j), 1, t(1,k), 1, v(1,b,1), nvirt**2)
end do
C$OMP END PARALLEL DO
endif
TIMEADD(times(1,19),times0(1,2)) ! }}}
return
end subroutine wabc
c }}}
c
************************************************************************
subroutine ptindices(icount, nocc, mpi_rank, mpi_size,
$ master_thread, find, lind,
$ notend, max_threads, nind, inds) ! {{{
************************************************************************
implicit none
c include "mpif.h"
integer, intent(in) :: mpi_rank, mpi_size, nocc
integer :: find, lind, max_threads
logical :: notend, master_thread
integer, dimension(0:nocc) :: icount
integer :: k
type indices
integer :: num = 0
integer, allocatable, dimension(:) :: val
end type
type(indices), dimension(mpi_size) :: ind
integer :: isum, i
integer, dimension(mpi_size) :: nums
integer, dimension(nocc) :: inds
integer :: nind
isum = icount(nocc)
do k = nocc, 1, -1
icount(k) = icount(k) - icount(k-1)
end do
nums = 0
do i = 1, mpi_size
allocate(ind(i)%val(nocc))
ind(i)%val = 0
end do
do i = 1, nocc
k = minloc(nums, 1)
nums(k) = nums(k) + icount(i)
ind(k)%num = ind(k)%num + 1
ind(k)%val(ind(k)%num) = i
end do
c if(mpi_rank .eq. 0 .and. mpi_size .gt. 1) then
if(master_thread .and. mpi_size .gt. 1) then
write(*, '("Allocation:")')
do i = 1, mpi_size
write(*, '(i4, " -- ")', advance = 'no') i
write(*, '(100(10i4, :, /, 8x))', advance = 'no')
$ (ind(i)%val(k), k = 1, ind(i)%num)
write(*, '(f12.2, "%")') dble(nums(i)) / isum * 100
end do
end if
nind = ind(mpi_rank+1)%num
inds(1:nocc) = ind(mpi_rank+1)%val(1:nocc)
do i = 1, mpi_size
deallocate(ind(i)%val)
end do
end subroutine ! }}}
************************************************************************
subroutine ptindices_old(icount, nocc, mpi_rank, mpi_size, find,
$ lind, notend, max_threads)
************************************************************************
implicit none
integer, intent(in) :: mpi_rank, mpi_size, nocc
integer :: find, lind, max_threads
logical :: notend
integer, dimension(0:nocc) :: icount
integer :: nrank, k
real*8 :: avgcount
nrank = 0
k = 1
avgcount = dble(icount(nocc)) / mpi_size ! average number of k's to process
do while(k .le. nocc)
if(icount(k)-icount(find-1) .ge. avgcount) then
! hit next rank's final index, check if previous k is more favourable
if(abs(icount(k-1) - icount(find-1) - avgcount) .lt.
$ abs(icount(k) - icount(find-1) - avgcount)) then
if(k .ne. find) then
k = k - 1
end if
end if
if(nrank .eq. mpi_rank) then
! found last index
if(nrank .eq. mpi_size-1) then
! last thread calculates all remaining indices
lind = nocc
max_threads = mpi_size
else
lind = k
end if
exit
else
! next rank's first index
nrank = nrank + 1
find = k + 1
end if
end if
if(k .eq. nocc) then
max_threads = nrank + 1
if(nrank .eq. mpi_rank) then
lind = nocc
else
! too many processes
notend = .false.
end if
end if
k = k + 1
end do
end subroutine
C {{{ wabc
************************************************************************
subroutine wabc_old(nocc,nvirt,i,j,k,imem,dcore,t,tt,rr,rrd,li,lj,
$ind,kinrr,w,v,abij,aijk,times,talg,ss,iinbl,localcc,IabIJ,lljpai,
$dfnb,tt2,tt3,tt4,ttd,abijinmem,jai)
************************************************************************
c called if talg=occ of talg=lapl
c build W and V matrices of the (T) for all a,b,c and a given i,j(,k) index set
c if talg=occ ss=tt=tt2=tt3; ttd=nocc
c if talg=lapl ss.ne.tt due to different Laplace factors & ttd=1
c tt(-,~,-,k) in tt, tt(-,~,k,-) in tt2, tt(-,~,i,j) in tt3, tt(-,~,j,i) in tt4
************************************************************************
implicit none
integer a,b,c,i,j,k,rrd,ind,li,lj,imem,kinrr,nocc,nvirt,l,dfnb
integer ttd,ti,tj,tk,intk
real*8 t(nvirt,nocc),tt(nvirt,nvirt,nocc,ttd)
real*8 rr(nvirt,nvirt,nocc,rrd),ss(nvirt,nvirt,nocc,nocc)
real*8 tt2(nvirt,nvirt,ttd,nocc)
real*8 tt4(nvirt,nvirt,ttd,ttd),tt3(nvirt,nvirt,ttd,ttd)
real*8 w(nvirt,nvirt,nvirt),v(nvirt,nvirt,nvirt)
real*8 abij(nvirt,nvirt,nocc,*) !<12|12>, only for j=1 if laplace + localcc
real*8 aijk(nvirt,nocc,nocc,nocc) !<12|12>
real*8 IabIJ(nvirt,nvirt),lljpai(dfnb,nvirt,nocc)
real*8 times(6,0:100),dcore(*),fact,jai(nvirt,nocc,dfnb)
character*4 talg,localcc
logical lapl,iinbl,abijinmem
lapl=.false.
if (talg.eq.'lapl'.or.talg.eq.'lato') lapl=.true.
fact=0.d0
if (.not.iinbl) fact=1.d0
if (lapl) then
ti=1 ! position in tt3/tt4
tj=1 ! position in tt3/tt4
tk=1 ! position in tt/tt2
else
ti=i
tj=j
tk=k
endif
intk=k ! localcc=off.and.(talg=occ/lapl)
if (.not.abijinmem) intk=1 ! position of k index in abij if abij is not stored in memory
C III a)
IF(i.eq.j)THEN
c {{{ (i.eq.j)
c TIME0(times)
c collect to v(b,a,c), then w(a,b,c)=v(b,a,c)+v(a,b,c)
c f) and b) (b here) I_ba,di*T_cd,kj^T
call dgemm('n','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(lj-1)*nvirt**3),nvirt**2,tt2(1,1,tk,j),
$nvirt,0.d0,v,nvirt**2)
c c) and e) (c here ! I symmetry) T_bd,jk*I_da,ci
call dgemm('n','n',nvirt,nvirt**2,nvirt,1.d0,
$tt(1,1,j,tk),nvirt,dcore(imem+(lj-1)*nvirt**3),
$nvirt,1.d0,v,nvirt)
c a) and d) (a here) T_bd,ji*I_ac,dk
call dgemm('n','t',nvirt,nvirt**2,nvirt,1.d0,
$tt4(1,1,tj,ti),nvirt,dcore(imem+(k-ind)*nvirt**3),
$nvirt**2,1.d0,v,nvirt)
c TIMEADD(times(1,13),times)
C 2. tag
c TIME0(times)
c b) and d) (b here) I_bl,jk*R_ac,li
if (rrd.eq.1) then
do l=1,nocc
call tr(nvirt,ss(1,1,l,i),w(1,1,l)) ! transpose a and b of tt(a,b,l)_i to w
enddo
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,j,k),nvirt,
$w,nvirt**2,1.d0,v,nvirt)
elseif (rrd.eq.nocc) then
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,j,k),nvirt,
$rr(1,1,1,i),nvirt**2,1.d0,v,nvirt)
endif
c c) and e) (c here) I_bl,jk*T_ac,lk
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,j,i),nvirt,
$ss(1,1,1,k),nvirt**2,1.d0,v,nvirt)
c a) and f) (a here) T_ba,li*I_cl,kj
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,ss(1,1,1,i),nvirt**2,
$aijk(1,1,k,j),nvirt,1.d0,v,nvirt**2)
c
call dcopy(nvirt**3,v,1,w,1)
do c=1,nvirt
call tradd(nvirt,1.d0,v(1,1,c),w(1,1,c))
enddo
c TIMEADD(times(1,14),times)
c }}}
ELSE IF(j.eq.k)THEN !i.ne.j!!!
c {{{ (j.eq.k)
c TIME0(times)
c collect to v(a,c,b), then w(a,b,c)=v(a,c,b)+v(a,b,c)
c b) and c) (c here, I symmetry) I_da,ci^T * T_bd,jk^T
if (iinbl) call dgemm('t','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(li-1)*nvirt**3),nvirt,tt(1,1,j,tk),
$nvirt,0.d0,v,nvirt**2)
c a) and f) (a here) I_ac,dk * T_bd,ji^T
call dgemm('n','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(lj-1)*nvirt**3),nvirt**2,tt4(1,1,tj,ti),
$nvirt,1.d0,v,nvirt**2)
c e) and d) (e here) T_ad,ik * I_cb,dj^T
call dgemm('n','t',nvirt,nvirt**2,nvirt,1.d0,
$tt(1,1,i,tk),nvirt,dcore(imem+(lj-1)*nvirt**3),
$nvirt**2,1.d0,v,nvirt)
c TIMEADD(times(1,15),times)
C 2. tag
c TIME0(times)
c a) and b), b: rr(a,c,l,i)*<bl|jk>^T -> v(a,c,b) !NP
if (rrd.eq.1) then
do l=1,nocc
call tr(nvirt,ss(1,1,l,i),w(1,1,l)) ! transpose a and b of tt(a,b,l)_i to w
enddo
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,w,nvirt**2,
$aijk(1,1,j,k),nvirt,1.d0,v,nvirt**2)
elseif (rrd.eq.nocc) then
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,rr(1,1,1,i),nvirt**2,
$aijk(1,1,j,k),nvirt,1.d0,v,nvirt**2)
endif
c c) and f) (c here) T_ac,lk * I_bl,ji^T
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,ss(1,1,1,k),nvirt**2,
$aijk(1,1,j,i),nvirt,1.d0,v,nvirt**2)
c d) and e) (d here) I_al,ik * T_cb,lj^T
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,i,k),nvirt,
$ss(1,1,1,j),nvirt**2,1.d0,v,nvirt)
c
call dcopy(nvirt**3,v,1,w,1)
do c=1,nvirt
do b=1,nvirt
call daxpy(nvirt,1.d0,v(b,c,1),nvirt**2,w(b,1,c),nvirt)
enddo
enddo
c TIMEADD(times(1,16),times)
c }}}
ELSE
c {{{ 6 <ab|ci>*T2 terms
c TIME0(times(1,99))
c e)
c TIME0(times)
call dgemm('n','n',nvirt,nvirt**2,nvirt,1.d0,
$tt(1,1,i,tk),nvirt,dcore(imem+(lj-1)*nvirt**3),nvirt,fact,w,nvirt)
c TIMEADD(times(1,45),times)
c f)
c TIME0(times)
if (lapl) then
call dgemm('n','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(lj-1)*nvirt**3),nvirt**2,
$tt2(1,1,tk,i),nvirt,1.d0,w,nvirt**2)
else
call dgemm('n','n',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(lj-1)*nvirt**3),nvirt**2,
$tt(1,1,i,tk),nvirt,1.d0,w,nvirt**2)
endif
c TIMEADD(times(1,46),times)
c d)
c TIME0(times)
call dgemm('n','t',nvirt,nvirt**2,nvirt,1.d0,
$tt3(1,1,ti,tj),nvirt,dcore(imem+(k-ind)*nvirt**3),
$nvirt**2,1.d0,w,nvirt)
c TIMEADD(times(1,44),times)
c a)
c TIME0(times)
if (lapl) then
call dgemm('t','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(k-ind)*nvirt**3),nvirt,tt4(1,1,tj,ti),
$nvirt,0.d0,v,nvirt**2) ! start v(c,a,b)
else
call dgemm('t','n',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(k-ind)*nvirt**3),nvirt,tt3(1,1,ti,tj),
$nvirt,0.d0,v,nvirt**2) ! start v(c,a,b)
endif
c TIMEADD(times(1,40),times)
c b)
c TIME0(times)
if (iinbl) then
call dgemm('n','n',nvirt,nvirt**2,nvirt,1.d0,tt2(1,1,tk,j),
$nvirt,dcore(imem+(li-1)*nvirt**3),nvirt,1.d0,v,nvirt)
c TIMEADD(times(1,42),times)
c c)
c TIME0(times)
if (lapl) then
call dgemm('n','t',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(li-1)*nvirt**3),nvirt**2,
$tt(1,1,j,tk),nvirt,1.d0,v,nvirt**2)
else
call dgemm('n','n',nvirt**2,nvirt,nvirt,1.d0,
$dcore(imem+(li-1)*nvirt**3),nvirt**2,
$tt2(1,1,tk,j),nvirt,1.d0,v,nvirt**2)
endif
endif ! iinbl
c TIMEADD(times(1,43),times)
c TIMEADD(times(1,17),times(1,99))
c }}}
c
c {{{ 6 <ai|jk>*T2 terms
c TIME0(times(1,99))
c b)
c TIME0(times)
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,ss(1,1,1,i),nvirt**2,
$aijk(1,1,j,k),nvirt,1.d0,v,nvirt**2)
c TIMEADD(times(1,51),times)
c c)
c TIME0(times)
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,rr(1,1,1,kinrr),
$nvirt**2,aijk(1,1,j,i),nvirt,1.d0,v,nvirt**2)
c TIMEADD(times(1,52),times)
c v(c,a,b) +> w(a,b,c)
c TIME0(times)
do c=1,nvirt
call daxpy(nvirt**2,1.d0,v(c,1,1),nvirt,w(1,1,c),1)
enddo
c TIMEADD(times(1,41),times)
if (rrd.eq.1) then
c a)
c TIME0(times)
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,k,j),nvirt,
$ ss(1,1,1,i),nvirt**2,0.d0,v,nvirt) ! start v(c,b,a)
c TIMEADD(times(1,50),times)
c d)
c TIME0(times)
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,ss(1,1,1,j),nvirt**2,
$aijk(1,1,i,k),nvirt,1.d0,v,nvirt**2)
c TIMEADD(times(1,53),times)
c v(c,b,a) +> w(a,b,c)
c TIME0(times)
do c=1,nvirt
do b=1,nvirt
call daxpy(nvirt,1.d0,v(c,b,1),nvirt**2,w(1,b,c),1)
enddo
enddo
c TIMEADD(times(1,47),times)
elseif(rrd.eq.nocc) then
c a)
c TIME0(times)
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,rr(1,1,1,i),nvirt**2,
$aijk(1,1,k,j),nvirt,1.d0,w,nvirt**2)
c TIMEADD(times(1,50),times)
c d)
c TIME0(times)
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,i,k),nvirt,
$ rr(1,1,1,j),nvirt**2,1.d0,w,nvirt)
c TIMEADD(times(1,53),times)
endif
c f)
c TIME0(times)
call dgemm('n','t',nvirt**2,nvirt,nocc,-1.d0,ss(1,1,1,j),nvirt**2,
$aijk(1,1,k,i),nvirt,1.d0,w,nvirt**2)
c TIMEADD(times(1,55),times)
c e)
c TIME0(times)
call dgemm('n','t',nvirt,nvirt**2,nocc,-1.d0,aijk(1,1,i,j),nvirt,
$ss(1,1,1,k),nvirt**2,1.d0,w,nvirt)
c TIMEADD(times(1,54),times)
c TIMEADD(times(1,18),times(1,99))
c }}}
ENDIF
C calculating V
c TIME0(times)
c call dcopy(nvirt**3,w,1,v,1)
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(a)
do a=1,nvirt
v(a,1:nvirt,1:nvirt)=w(a,1:nvirt,1:nvirt)
$ +t(a,i)*abij(1:nvirt,1:nvirt,j,intk)
enddo
C$OMP END PARALLEL DO
c
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(b)
do b=1,nvirt
v(1:nvirt,b,1:nvirt)=v(1:nvirt,b,1:nvirt)
$ +t(b,j)*abij(1:nvirt,1:nvirt,i,intk)
enddo
C$OMP END PARALLEL DO
c
if (.not.abijinmem.and.j.ne.1) then
if (lapl.and.localcc.ne.'off ') then ! abij(.,.,i,j) is not available
call dgemm('t','n',nvirt,nvirt,dfnb,1.d0,lljpai(1,1,i),dfnb,
$ lljpai(1,1,j),dfnb,0.d0,IabIJ,nvirt) ! assembly the missing abij(.,.,i,j)
else ! talg=occ , jlen=1
call dgemm('n','t',nvirt,nvirt,dfnb,1.d0,jai(1,i,1),nvirt*nocc,
$ jai(1,j,1),nvirt*nocc,0.d0,IabIJ,nvirt) ! assembly the missing abij(.,.,i,j)
endif
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(c)
do c=1,nvirt
v(1:nvirt,1:nvirt,c)=v(1:nvirt,1:nvirt,c)
$ +t(c,k)*Iabij(1:nvirt,1:nvirt)
enddo
C$OMP END PARALLEL DO
else ! abijinmem.or.j.eq.1
C$OMP PARALLEL DO Schedule(Dynamic) Default(Shared)
C$OMP& PRIVATE(c)
do c=1,nvirt
v(1:nvirt,1:nvirt,c)=v(1:nvirt,1:nvirt,c)
$ +t(c,k)*abij(1:nvirt,1:nvirt,i,j)
enddo
C$OMP END PARALLEL DO
endif
c do a=1,nvirt
c do b=1,nvirt
c do c=1,nvirt
c write(*,'(6i4,2es14.6)') k,j,i,a,b,c,w(a,b,c),v(a,b,c)
c enddo
c enddo
c enddo
return
end subroutine
c }}}
************************************************************************
subroutine dgemm_wabc(tra, trb, ndim1, ndim2, ndim3,
$ f1, a, lda, b, ldb, f2, c, ldc, ccsdmkl, v, d)
************************************************************************* ! {{{
c v = 0 : c(i, j, k) = a(i, j, l) * b(l, k) or a(i, l) * b(l, j, k)
c v = 1 : for j: c(i, j, k) = a(i, j, l) * b(l, k)
c v = 2 : for k: c(i, j, k) = a(i, k, l) * b(l, j)
c v = 3 : for k: c(i, j, k) = a(i, l) * b(l, j, k)
implicit none
integer ::v, d, ndim1, ndim2, ndim3, lda, ldb, ldc
real*8 :: a(lda, *), b(ldb, *), c(ldc, *), f1, f2
character :: tra, trb
character(len = 3) :: ccsdmkl
integer :: l
integer :: nthread, ptthreads
integer :: omp_max_threads, ccsdthreads, mkl_max_threads
common /threads/ ccsdthreads, ptthreads, omp_max_threads,
$ mkl_max_threads
integer omp_get_thread_num, nthreads
nthreads = omp_max_threads / ptthreads
#ifdef OMP
if(omp_get_thread_num()+1 .le. mod(omp_max_threads, ptthreads))
$ nthreads = nthreads + 1
#endif
c#ifdef MKL
c if(ccsdmkl .eq. 'seq') call mkl_set_num_threads(1)
c#endif
if(ccsdmkl .eq. 'thr' .or. v .eq. 0) then
call dgemm(tra, trb, ndim1, ndim2, ndim3,
$ f1, a, lda, b, ldb,
$ f2, c, ldc)
else if(v .eq. 1) then
if(tra .eq. 'n' .or. tra .eq. 'N') then
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1/d, ndim2, ndim3,
$ f1, a(ndim1/d*(l-1)+1, 1), lda, b, ldb,
$ f2, c(ndim1/d*(l-1)+1, 1), ldc)
end do
c$OMP END PARALLEL DO
else
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1/d, ndim2, ndim3,
$ f1, a(1, ndim1/d*(l-1)+1), lda, b, ldb,
$ f2, c(ndim1/d*(l-1)+1, 1), ldc)
end do
c$OMP END PARALLEL DO
end if
else if(v .eq. 2) then
if(tra .eq. 'n' .or. tra .eq. 'N') then
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1/d, ndim2, ndim3,
$ f1, a(ndim1/d*(l-1)+1, 1), lda, b, ldb,
$ f2, c(1, ndim2*(l-1)+1), ldc)
end do
c$OMP END PARALLEL DO
else
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1/d, ndim2, ndim3,
$ f1, a(1, ndim1/d*(l-1)+1), lda, b, ldb,
$ f2, c(1, ndim2*(l-1)+1), ldc)
end do
c$OMP END PARALLEL DO
end if
else if(v .eq. 3) then
if(trb .eq. 'n' .or. trb .eq. 'N') then
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1, ndim2/d, ndim3,
$ f1, a, lda, b(1, ndim2/d*(l-1)+1), ldb,
$ f2, c(1, ndim2/d*(l-1)+1), ldc)
end do
c$OMP END PARALLEL DO
else
c$OMP PARALLEL DO SCHEDULE(DYNAMIC) DEFAULT(SHARED) PRIVATE(l)
c$OMP& NUM_THREADS(nthreads)
do l = 1, d
call dgemm(tra, trb, ndim1, ndim2/d, ndim3,
$ f1, a, lda, b(ndim2/d*(l-1)+1, 1), ldb,
$ f2, c(1, ndim2/d*(l-1)+1), ldc)
end do
c$OMP END PARALLEL DO
end if
end if
c#ifdef MKL
c if(ccsdmkl .eq. 'seq') call mkl_set_num_threads(nthread)
c#endif
end subroutine ! }}}
************************************************************************
subroutine get_abci(i, abc_ofi, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis, nocc, nvirt,
$ nirmax, multpg, ccsdalg, dfnb, jab, jai, dcore, imem, n,
$ addr, ccsdmkl)
************************************************************************
implicit none
integer :: i, nvirt, nocc, nbasis, dfnb, imem, n, addr
integer :: irecln, ibufln, nirmax, dgroup
integer :: bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
real*8 :: va(nvirt,nvirt,nvirt,*)
real*8 :: dcore(*)
integer :: co(nbasis)
integer :: first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
real*8, pointer :: abc_ofi(:, :, :)
real*8 :: jai(nvirt, nocc, dfnb), jab(dfnb, nvirt, nvirt)
character*8 :: ccsdalg
character*3 :: ccsdmkl
integer :: a, b
real*8 :: scr1v(nvirt)
c call rpoint3d(dcore(addr), abc_ofi, nvirt, nvirt, nvirt)
c write(*,*) "after pointer"
if(ccsdalg .eq. 'disk ') then
call abciread(i, 1, abc_ofi, bsymmv, symmocc, dgroup,
$ co, first, last, va, irecln, ibufln, nbasis,
$ nocc, nvirt, nirmax, multpg)
else if(ccsdalg .eq. 'dfdirect') then
! (ac|bi) = <ab|ci>
call dgemm_wabc('t', 't', nvirt**2, nvirt, dfnb,
$ 1.d0, jab, dfnb, jai(1, i, 1), nvirt*nocc,
$ 0.d0, abc_ofi, nvirt**2, ccsdmkl, 1, nvirt)
! acbi -> abci
do a = 1, nvirt
do b = 1, nvirt
scr1v(b+1:nvirt) = abc_ofi(a, b+1:nvirt, b)
abc_ofi(a, b+1:nvirt, b) =
$ abc_ofi(a, b, b+1:nvirt)
abc_ofi(a, b, b+1:nvirt) = scr1v(b+1:nvirt)
end do
end do
end if
end subroutine
C
************************************************************************
subroutine ttsort(tt,ttscr,nocc,nvirt)
************************************************************************
implicit none
integer nocc, nvirt
real*8 tt(nocc,nocc,nvirt,nvirt)
real*8 ttscr(nvirt,nvirt,nocc,nocc)
c real*8,allocatable :: ttscr(:,:,:,:)
integer i,j
c allocate(ttscr(nvirt,nvirt,nocc,nocc))
call dcopy(nvirt**2*nocc**2,tt,1,ttscr,1)
do j=1,nocc
do i=1,nocc
call dcopy(nvirt**2,ttscr(1,1,i,j),1,tt(i,j,1,1),nocc**2)
enddo
enddo
return
end subroutine
C
************************************************************************
subroutine sortabci(bsymmv,symmocc,dgroup,co,
$first,last,w,
$maxcor,imem,imem1,dcore,nocc,nvirt,nbasis,iout,nirmax,
$irecln,ibufln,multpg)
************************************************************************
implicit none
integer maxcor, imem, imem1
real*8 dcore(*)
integer nocc, nvirt, nbasis
integer iout
integer nirmax, dgroup
integer bsymmv(nirmax+1),symmocc(nirmax),multpg(nirmax,nirmax)
integer irecln, ibufln
integer co(nbasis)
integer first(0:5,8,8,nbasis),last(0:5,8,8,nbasis)
real*8 w(*)
integer imem2
integer n,nw,ind,ind2,avmem
logical notend,notend2
ind=1
notend=.true.
do while(notend)
C calculate free memory for old & new list
if(maxcor-(imem-imem1).ge.nocc*nvirt**2+nvirt**3)then
c avmem=maxcor-(imem-imem1)+max(nocc**3,ibufln)+nocc**3-nvirt**3 !SzL bug if ibufln>V**3-O**3
c imem2=imem-max(nocc**3,ibufln)-nocc**3+nvirt**3 !SzL
avmem=maxcor-(imem-imem1) !NP
imem2=imem
n=(avmem-nocc*nvirt**2-modulo(avmem-nocc*nvirt**2,nvirt**3))/
$nvirt**3
nw=(avmem-n*nvirt**3-modulo(avmem-n*nvirt**3,nocc*nvirt**2))/
$(nocc*nvirt**2)
else
write(iout,*)'Insufficient memory!'
call mrccend(1)
endif
if(ind+n-1.ge.nocc)then
notend=.false.
n=nocc-ind+1
nw=(avmem-n*nvirt**3-modulo(avmem-n*nvirt**3,nocc*nvirt**2))/
$(nocc*nvirt**2)
endif
C read old list
call abciread(ind,n,dcore(imem2),bsymmv,symmocc,dgroup,co,
$first,last,w,
$irecln,ibufln,nbasis,nocc,nvirt,nirmax,multpg)
C read&write new list
ind2=1
notend2=.true.
do while(notend2)
if(ind2+nw-1.ge.nvirt)then
notend2=.false.
nw=nvirt-ind2+1
endif
if(ind.ne.1)then
call iabcread(nw,ind2,dcore(imem2+n*nvirt**3),w,
$ nocc,nvirt,irecln,ibufln)
endif
call atpakol(n,nw,ind,ind2,dcore(imem2),
$dcore(imem2+n*nvirt**3),nvirt,nocc)
call iabcwrite(nw,ind2,dcore(imem2+n*nvirt**3),w,
$ nocc,nvirt,irecln,ibufln)
ind2=ind2+nw
enddo
ind=ind+n
enddo
return
end subroutine
c
C {{{ rralloc
************************************************************************
subroutine rralloc(avmem,nocc,nvirt,itrr,rrd,restofarrays)
************************************************************************
c determine and allocate the size of the rr array storing the transpose of T2
************************************************************************
implicit none
integer avmem,nocc,nvirt,restofarrays,dblalloc,itrr,rrd
integer ccsdthreads, ptthreads, omp_max_threads, mkl_max_threads
common /threads/ ccsdthreads, ptthreads, omp_max_threads,
$ mkl_max_threads
c
if (avmem-restofarrays.ge.nvirt**2*nocc**2) then
itrr=dblalloc(nvirt**2*nocc**2) ! store also rr=tt^traspose for the full tt
rrd=nocc
else
itrr=dblalloc(nvirt*nocc*max(nvirt,nocc)*ptthreads) ! v*o**2 for lajik
rrd=1
endif
c
return
end subroutine
c }}}
************************************************************************
subroutine progress_bar(iout, pr, first_call)
************************************************************************
c Prints progress bar based on pr containing the value of the progress normalized to unity.
implicit none
real*8, intent(in) :: pr
integer, intent(in) :: iout
logical, intent(in) :: first_call
integer, save :: pr_prev
integer :: pr_current, i
logical :: last_call
integer, parameter :: length = 42
character(len = length) ::
$ bar = ' |---------|---------|---------|---------|'
if(first_call) then
write(iout, '(a44)')
$ ' 0% 25% 50% 75% 100%'
pr_prev = 2
write(iout, '(a2)', advance = 'no') bar(1:pr_prev)
flush(iout)
end if
pr_current = min(nint(pr / 0.025d0), 40) + 2
last_call = pr_current.eq.length .and. pr_prev.ne.length
do i = pr_prev+1, pr_current
write(iout, '(a1)', advance = 'no') bar(i:i)
flush(iout)
end do
if(last_call) then
write(iout, *)
flush(iout)
end if
pr_prev = max(pr_current, pr_prev)
end subroutine