Dan,
I've generated a simplified code, based on the direct reduction of a profile solver. I've included 6 inner loop options, the 4 I presented plus 2 variations of Paul's post.
It should be compiled with /lgo or /opt /lgo.
To change the size of the problem, I'd recommend changing 'max_size'. It can run up to 2gb on Win_7-64 or probably 1.3gb on XP_32.
The results show that the wrapper improves by a factor of 3 with FTN95 /opt.
I need to test it with other compilers.
253 lines of code, so see how much can be copied per post.
John
module big_arrays
integer*4, parameter :: mb_8 = 1024*1024/8 ! real*8 variables in 1 mb
!
integer*4, parameter :: max_eqn = 200000 ! maximum equations possible
integer*4, parameter :: max_band = 2800 ! maximum profile size
integer*4, parameter :: max_size = 100*mb_8 ! maximum size profile matrix (CHANGE THIS)
!
integer*4, allocatable, dimension(:) :: nb ! profile storage
real*8, allocatable, dimension(:) :: B ! matrix storage
!
integer*4 neq
integer*4 mstor
integer*4 avg_band
!
end module big_arrays
use big_arrays
!
integer*4 ieq, iband, ipos, method, k
real*8 c_start(6), c_end(6), x, dt, ops_per_sec
integer*8 num_ops, num_call
!
character method_name(6)*30
data method_name / 'Original DO Loop', 'F90 syntax', 'F77 wrapper for DO Loop', &
'F77 wrapper for Dot_Product', 'Paul option', 'alternate Paul option' /
!
! generate profile in nb
!
allocate ( nb(max_eqn) )
!
iband = 1
nb(1) = 1
nb(2) = 0
do ieq = 1, max_eqn
call random_number (x)
iband = (iband + x * max_band) / 2
nb(ieq+2) = nb(ieq+1) + min (iband,ieq)
if (nb(ieq+2) > max_size) exit
end do
!
! generate diagonal pointer
neq = ieq-1
do ieq = 0,neq
nb(ieq+2) = nb(ieq+2) + neq+2
end do
!
mstor = nb(neq+2)
avg_band = mstor / neq
x = real(mstor) / real(mb_8)
!
write (*,2001) neq, ' equations'
write (*,2001) avg_band, ' average profile'
write (*,2001) mstor, ' coefficients'
write (*,2002) x , ' storage (mb)'
write (*,1001) ' '
2001 format (i10,a)
2002 format (f10.2,a)
!
! initialise the matrix
!
allocate ( B(mstor) )
do ieq = 1,neq
ipos = ieq+2
iband = nb(ipos) - nb(ipos-1)
do k = nb(ipos-1)+1, nb(ipos)
b(k) = 1
end do
b(nb(ipos)) = iband
end do
!
do method = 1,6
!
call cpu_time (c_start(method))
!
call redblk (B, NB, B, NB, method, num_ops, num_call)
!
call cpu_time (c_end(method))
dt = c_end(method)-c_start(method)
ops_per_sec = dble (num_ops) / dt
write (*,1001) ' Method', method, ' CPU_time =',dt,' ops/sec =',ops_per_sec, method_name(method)
end do
1001 format (a,i2,a,f9.3,a,en12.2,2x,a)
!
write (*,*) num_call,' Number of dot_product calls'
write (*,*) num_ops, ' Number of itterations'
!
end
SUBROUTINE REDBLK (A, NA, B, NB, method, num_ops, num_call)
!
! reduces block 'A' by block 'B'
!
REAL*8 A(*), B(*)
INTEGER*4 NA(*), NB(*), method
integer*8 num_ops, num_call
INTRINSIC MIN, MAX
!
INTEGER*4 I, IBOT, ITOP, IPOINT, A_ptr_0, A_ptr_I, IL, &
J, JBOT, JTOP, JB, JT, JI