Here is my code for the multi-threaded matrix multiplication. I still need to make it run 'thread-wise'. Vec_Sum is a dot product (Dot_Product), which can be replaced by David's SSE code.
John
!> Time the row-partitioned matrix product C = A*B and report its error.
!!
!! The rows of C are split into n_thread interleaved sets; each set is
!! computed by one call to matmul_this_thread. The calls are issued one after
!! another from an ordinary do loop, so this routine exercises the per-thread
!! partitioning but does not itself run anything concurrently.
!!
!! Example sizes from the original notes:
!!   A(1000000,100)   800 MB
!!   B(100,10)          8 kB
!!   C(1000000,10)     80 MB
!!
!! Arguments:
!!   A, B    (in)  factors with shapes (l,m) and (m,n)
!!   C       (out) result with shape (l,n); zeroed, then filled
!!   chk     (in)  array handed to err_max together with C
!!                 (presumably the reference product - confirm with err_max)
!!   l, m, n (in)  array extents
!!   times   (out) te - ts, the difference of the two time_step readings
subroutine matmul_thread_test (A,B,C, chk, l,m,n, times)
  implicit none
!
  integer*4, intent(in)  :: l, m, n
  real*8,    intent(in)  :: A(l,m), B(m,n), chk(l,n)
  real*8,    intent(out) :: C(l,n)
  real*4,    intent(out) :: times(2)
!
! Number of row partitions ("threads") used by test case 8.
  integer*4, parameter :: default_threads = 3
!
  integer*4 :: thread, n_thread
  real*4    :: ts(2), te(2)
  real*4, external :: err_max
!
! 8) Row of A is used - sequential Vec_Sum, default_threads partitions
!
  C = 0.0d0
  call time_step (ts)
!
  n_thread = default_threads
  do thread = 1, n_thread
    ! Each call fills rows thread, thread+n_thread, ... of C.
    call matmul_this_thread (thread, n_thread, A,B,C, l,m,n)
  end do
!
  call time_step (te)
  times = te - ts
  write (*,*) times, ' 8) row Vector_sum Thread ', err_max (C, chk, l,n)
!
end subroutine matmul_thread_test
!> Compute one thread's share of the product C = A*B.
!!
!! Handles rows thread, thread+n_thread, thread+2*n_thread, ... up to l.
!! For thread = 1..n_thread these row sets are disjoint and together cover
!! rows 1..l, so different thread values never write the same element of C.
!!
!! Arguments:
!!   thread   (in)    index of this partition; must be >= 1
!!   n_thread (in)    number of partitions (row stride); must be >= 1
!!   A, B     (in)    factors with shapes (l,m) and (m,n)
!!   C        (inout) result with shape (l,n); only this partition's rows
!!                    are written, all other rows are left untouched
!!   l, m, n  (in)    array extents
subroutine matmul_this_thread (thread, n_thread, A,B,C, l,m,n)
  implicit none
!
  integer*4, intent(in)    :: thread, n_thread
  integer*4, intent(in)    :: l, m, n
  real*8,    intent(in)    :: A(l,m), B(m,n)
  real*8,    intent(inout) :: C(l,n)
!
  integer*4 :: i, j
  real*8, dimension(:), allocatable :: row
  real*8, external :: Vec_Sum
!
! A zero stride is not a legal DO increment, and a non-positive start or
! negative stride would index A and C out of bounds, so reject them up front.
  if (thread < 1 .or. n_thread < 1) then
    stop 'matmul_this_thread: thread and n_thread must be >= 1'
  end if
!
  allocate ( row(m) )
!
  do i = thread, l, n_thread          ! l = 1000000 in the reference case
    ! Row i of A is strided in memory (column-major storage); gather it once
    ! into a contiguous buffer that is reused for all n dot products.
    row(1:m) = A(i,1:m)               ! m = 100
    do j = 1, n                       ! n = 10
      ! Column j of B is contiguous; the section stays valid even when m = 0.
      C(i,j) = Vec_Sum (row, B(:,j), m)
    end do
  end do
!
  deallocate ( row )
end subroutine matmul_this_thread
!> Dot product of the first n elements of A and B.
!!
!! Arguments:
!!   A, B (in) vectors of length n
!!   n    (in) number of elements to combine; n <= 0 gives zero-size
!!             vectors and a result of 0
!!
!! Returns sum(A(i)*B(i), i = 1..n) in double precision. The intrinsic
!! dot_product is used so the compiler is free to vectorise the reduction;
!! the summation order may therefore differ from a strict left-to-right loop.
pure function Vec_Sum (A, B, n) result (s)
  implicit none
  integer*4, intent(in) :: n
  real*8,    intent(in) :: A(n), B(n)
  real*8 :: s
!
  s = dot_product (A, B)
end function Vec_Sum