I looked at another application of dot_product and wrote a simple test example, which I compiled with both the default and /opt compilation modes.
The differences in performance between the options on my Core i5 are dramatic.
! big_arrays: problem dimensions and the three work matrices shared by the
! matrix-multiply timing test.
module big_arrays
  implicit none
  !
  ! Matrix extents: A is L x M, B is M x N and C is L x N.
  integer*4, parameter :: L = 1000, M = 1000, N = 1000
  !
  ! Module-level storage for the two operands and the result.
  real*8 :: A(L,M)
  real*8 :: B(M,N)
  real*8 :: C(L,N)
end module big_arrays
! Timing driver: forms C = A*B five different ways (intrinsic MATMUL, then
! MAT_MUL, MAT_MUL_d, MAT_MUL_c and MAT_MUL_a) and, for each pass, prints
!   line 1: the CPU time spent in each variant (the interval also covers the
!           sum(C) reduction that follows the multiply), and
!   line 2: the difference between each variant's sum(C) and MATMUL's.
program matmul_timing
  use big_arrays
  implicit none
  real*8 :: stamp(6)     ! cpu_time readings bracketing the five variants
  real*8 :: total(5)     ! sum(C) produced by each variant
  integer*4 :: idx, trial
  !
  ! Fill both operands with pseudo-random values.
  call random_number (A)
  call random_number (B)
  !
  ! The full set of measurements is taken twice.
  do trial = 1, 2
    !
    ! Variant 1: intrinsic MATMUL.
    call cpu_time (stamp(1))
    C = matmul (A, B)
    total(1) = sum (C)
    !
    ! Variant 2: explicit triple loop.
    call cpu_time (stamp(2))
    call MAT_MUL (A, B, C, l, m, n)
    total(2) = sum (C)
    !
    ! Variant 3: dot_product on array sections of A and B.
    call cpu_time (stamp(3))
    call MAT_MUL_d (A, B, C, l, m, n)
    total(3) = sum (C)
    !
    ! Variant 4: dot_product with the row of A copied to a temporary first.
    call cpu_time (stamp(4))
    call MAT_MUL_c (A, B, C, l, m, n)
    total(4) = sum (C)
    !
    ! Variant 5: external fast_asm_ddotprod with the same row temporary.
    call cpu_time (stamp(5))
    call MAT_MUL_a (A, B, C, l, m, n)
    total(5) = sum (C)
    !
    call cpu_time (stamp(6))
    !
    write (*,'(6f10.4)') (stamp(idx+1)-stamp(idx), idx=1,5)
    write (*,'(6es10.2)') (total(idx)-total(1), idx=1,5)
  end do
end program matmul_timing
subroutine MAT_MUL (A,B,C, l,m,n)
!
! Reference matrix product C = A*B written as an explicit triple loop.
!
!   A(l,m)   in    left operand
!   B(m,n)   in    right operand
!   C(l,n)   out   product; every element is overwritten
!   l, m, n  in    extents of the three arrays
!
! The loop nest is intentionally left in i-j-k order: the innermost loop
! walks along a row of A, which is non-unit-stride access in Fortran's
! column-major layout. This routine is the baseline the other variants in
! the timing test are measured against.
!
  implicit none
  integer*4, intent(in)  :: l,m,n
  real*8,    intent(in)  :: A(l,m), B(m,n)
  real*8,    intent(out) :: C(l,n)
!
  integer*4 :: i,j,k
  real*8    :: s
!
  do i = 1,l
    do j = 1,n
      ! Accumulate in a scalar so C(i,j) is stored exactly once.
      s = 0.0d0
      do k = 1,m
        s = s + A(i,k)*B(k,j)
      end do
      C(i,j) = s
    end do
  end do
end subroutine MAT_MUL
subroutine MAT_MUL_d (A,B,C, l,m,n)
!
! Matrix product C = A*B where each element is formed by the DOT_PRODUCT
! intrinsic applied directly to array sections of A and B.
!
!   A(l,m)   in    left operand
!   B(m,n)   in    right operand
!   C(l,n)   out   product; every element is overwritten
!   l, m, n  in    extents of the three arrays
!
  implicit none
  integer*4, intent(in)  :: l,m,n
  real*8,    intent(in)  :: A(l,m), B(m,n)
  real*8,    intent(out) :: C(l,n)
!
  integer*4 :: i,j
!
  do i = 1,l
    do j = 1,n
      ! A(i,:) is a row section (stride l in memory); B(:,j) is a
      ! contiguous column.
      C(i,j) = dot_product (A(i,:), B(:,j))
    end do
  end do
end subroutine MAT_MUL_d
subroutine MAT_MUL_c (A,B,C, l,m,n)
!
! Matrix product C = A*B using the DOT_PRODUCT intrinsic, with the current
! row of A first copied into a contiguous temporary.
!
!   A(l,m)   in    left operand
!   B(m,n)   in    right operand
!   C(l,n)   out   product; every element is overwritten
!   l, m, n  in    extents of the three arrays
!
  implicit none
  integer*4, intent(in)  :: l,m,n
  real*8,    intent(in)  :: A(l,m), B(m,n)
  real*8,    intent(out) :: C(l,n)
!
  integer*4 :: i,j
  real*8    :: row(m)      ! automatic array holding row i of A
!
  do i = 1,l
    ! Gather the strided row once so the n dot products below all read
    ! two unit-stride vectors.
    row = A(i,:)
    do j = 1,n
      C(i,j) = dot_product (row, B(:,j))
    end do
  end do
end subroutine MAT_MUL_c
subroutine MAT_MUL_a (A,B,C, l,m,n)
!
! Matrix product C = A*B where each element is obtained from the external
! function fast_asm_ddotprod, with the current row of A first copied into a
! contiguous temporary.
!
!   A(l,m)   in    left operand
!   B(m,n)   in    right operand
!   C(l,n)   out   product; every element is overwritten
!   l, m, n  in    extents of the three arrays
!
! NOTE(review): fast_asm_ddotprod is defined outside this view. From the
! call it is presumably a real*8 dot product of two contiguous vectors of
! length m, called through an implicit interface -- confirm against its
! definition.
!
  implicit none
  integer*4, intent(in)  :: l,m,n
  real*8,    intent(in)  :: A(l,m), B(m,n)
  real*8,    intent(out) :: C(l,n)
!
  integer*4 :: i,j
  real*8    :: row(m)      ! automatic array holding row i of A
  real*8    :: fast_asm_ddotprod
  external fast_asm_ddotprod
!
  ! With m = 0 the reference B(1,j) below would be out of bounds. An empty
  ! inner dimension gives an all-zero product, which is what the loop and
  ! DOT_PRODUCT variants of this routine return.
  if (m < 1) then
    C = 0.0d0
    return
  end if
!
  do i = 1,l
    ! Gather the strided row once so the external routine sees a
    ! contiguous vector.
    row = A(i,:)
    do j = 1,n
      ! B(1,j) passes the start of column j; the m elements of that column
      ! follow it contiguously in memory.
      C(i,j) = fast_asm_ddotprod (row, B(1,j), m)
    end do
  end do
end subroutine MAT_MUL_a
The run performance I obtained is:
[FTN95/Win32 Ver. 6.10.0 Copyright (c) Silverfrost Ltd 1993-2011]
PROCESSING MODULE [<BIG_ARRAYS> FTN95/Win32 v6.10.0]
NO ERRORS [<BIG_ARRAYS> FTN95/Win32 v6.10.0]
NO ERRORS [<main program> FTN95/Win32 v6.10.0]
NO ERRORS [<MAT_MUL> FTN95/Win32 v6.10.0]
NO ERRORS [<MAT_MUL_D> FTN95/Win32 v6.10.0]
NO ERRORS [<MAT_MUL_C> FTN95/Win32 v6.10.0]
NO ERRORS [<MAT_MUL_A> FTN95/Win32 v6.10.0]
NO ERRORS [<FAST_ASM_DDOTPROD> FTN95/Win32 v6.10.0]
Creating executable: c:\\TEMP\\FTN95_test\\lgotemp@.exe
Program entered
11.7001 7.6596 7.6440 1.1388 0.7020
0.00E+00 0.00E+00 0.00E+00 0.00E+00 0.00E+00
That's roughly a 17x improvement (11.70 s down to 0.70 s) using David's new dot_product approach.