|
forums.silverfrost.com Welcome to the Silverfrost forums
|
View previous topic :: View next topic |
Author |
Message |
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Mon Dec 11, 2017 12:58 pm Post subject: |
|
|
The following is an example where the number of threads can be varied more easily.
The same routine is called for multiple threads, with the "arg" indicating which thread is being called.
It appears to work, giving the same answer.
Code: | module data_mod
implicit none
integer, parameter :: dp=kind(1.d0)
real(kind=dp) ds
real(kind=dp) r1,x1,x2,r2,v1,ns,tmax(4),smax(4)
integer,parameter::IO_LOCK = 42
integer max_threads
real*4 times(2)
contains
! Record the current wall-clock/CPU times into the shared module array
! "times" as the start reference of a timed section (paired with time_end).
subroutine time_start
call get_times (times)
end subroutine time_start
! Close a timing section opened by time_start: overwrite the shared module
! array "times" with (current - start), i.e. elapsed wall-clock seconds in
! times(1) and elapsed CPU seconds in times(2).
subroutine time_end
real*4 t_stop(2)
call get_times (t_stop)
times(1) = t_stop(1) - times(1)
times(2) = t_stop(2) - times(2)
end subroutine time_end
! Sample both clocks at once:
!   now(1) = wall-clock seconds (via elapse_time / system_clock)
!   now(2) = CPU seconds (standard cpu_time intrinsic)
subroutine get_times (now)
real*4 now(2)
call elapse_time (now(1))
call cpu_time (now(2))
end subroutine get_times
! Wall-clock time in seconds since an arbitrary epoch, derived from the
! integer*8 system_clock counter.
! NOTE(review): returning absolute seconds as real*4 limits the resolution
! of differences when the raw tick count is large; callers needing fine
! resolution should difference the integer ticks instead.
subroutine elapse_time (seconds)
real*4 seconds
integer*8 tick, rate
call system_clock (tick, rate)
if (rate > 0) then
  seconds = dble(tick) / dble(rate)
else
  ! The standard sets rate to zero when no clock is available;
  ! avoid dividing by zero and report zero elapsed time.
  seconds = 0.0
end if
end subroutine elapse_time
! Thread worker: step a thread-private accumulator across this thread's
! share of the interval (0,1] in increments of the shared step ds,
! counting the iterations.  Console output is serialized via IO_LOCK.
subroutine search1 (ithread)
include<windows.ins>
integer, intent(in) :: ithread
real(kind=dp) s, s_upper
integer n_steps
call lock@(IO_LOCK)
write(6,*) 'Starting A thread ',ithread
call unlock@(IO_LOCK)
! This thread covers ( (ithread-1)/max_threads , ithread/max_threads ].
s_upper = dble(ithread)/dble(max_threads)
s = epsilon(1.0d0) + s_upper - 1.0d0/dble(max_threads)
n_steps = 0
do
if (s > s_upper) exit
s = s + ds
n_steps = n_steps+1
end do
call lock@(IO_LOCK)
write(6,*) 'Completed A thread ',ithread, n_steps, s
call unlock@(IO_LOCK)
end subroutine search1
! Thread worker: scan this thread's slice of the range (0,1] for the
! maximum of t_loc, then store the per-thread maximum and its location
! in the shared arrays tmax(ithread)/smax(ithread).
! NOTE(review): the formula looks like an induction-machine torque-vs-slip
! search (r1,x1 series with r2/s) -- confirm the physical interpretation.
subroutine searchX (ithread)
include<windows.ins>
integer, intent(in) :: ithread
! Thread-private locals; r1,r2,x1,x2,v1,ns,ds,max_threads,tmax,smax are
! the shared module variables.
real(kind=dp) s_loc, slim, tmax_loc, t_loc, k_loc, smax_loc
complex(kind=dp) i_loc
integer n_do
! Serialize console output between threads.
call lock@ (IO_LOCK)
write(6,*) 'Starting C1 thread',ithread
call unlock@ (IO_LOCK)
tmax_loc = 0.d0
! 4*atan(1) = pi, so k_loc = 90/(pi*ns).
k_loc = 90.d0/(4.d0*atan(1.d0)*ns)
! This thread covers ( (ithread-1)/max_threads , ithread/max_threads ].
slim = dble(ithread)/dble(max_threads)
s_loc = epsilon(1.0d0) + slim - 1.0d0/dble(max_threads)
n_do = 0
do while (s_loc <= slim)
i_loc = v1/cmplx((r1+r2/s_loc),(x1+x2),kind=dp)
!!! t_loc = (k_loc)*(abs(i_loc)*abs(i_loc)*(r2/s_loc))
t_loc = (k_loc)*((abs(i_loc)**2)*(r2/s_loc))
! smax_loc is assigned here on the first pass (t_loc > 0 > tmax_loc),
! so it is defined whenever the loop body runs at least once.
if (t_loc > tmax_loc) then
tmax_loc = t_loc
smax_loc = s_loc
end if
s_loc = s_loc + ds
n_do = n_do+1
end do
! Each thread writes only its own array element, so no lock is taken
! here; the commented-out lock calls below were an experiment.
!z call lock@ (ithread) ! ithread should be hh(ithread) ??
tmax(ithread) = tmax_loc
smax(ithread) = smax_loc
!z call unlock@ (ithread)
call lock@ (IO_LOCK)
write(6,*) 'Completed C1 thread',ithread, n_do, tmax_loc
call unlock@ (IO_LOCK)
end subroutine searchX
|
|
|
Back to top |
|
|
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Mon Dec 11, 2017 12:59 pm Post subject: |
|
|
The test routine is: Code: | subroutine do_tests
! Driver: for each of the two workers (kk=1 -> searchX, kk=2 -> search1)
! time (a) a serial run of 4 calls, (b) the same work as 4 explicit
! start_thread@ calls, and (c) the same work started from a DO loop,
! using the arg(:) array so each thread receives a unique argument address.
include<windows.ins>
real*4 tserial(2), tparallel(2)
! NOTE(review): kind=7 is presumably FTN95's address-sized integer kind
! for thread handles -- confirm against the FTN95 documentation.
integer(kind=7) h1,h2,h3,h4, hh(4)
integer kk, j, arg(4)
! Initialize the shared module data used by searchX.
r1 = 0.295d0 ; r2 = 0.150d0 ; x1 = 0.510d0 ; x2 = 0.210d0 ; v1 = 200.d0/sqrt(3.d0) ; ns = 1200.d0
max_threads = 4
do kk = 1,2
write(6,*) ' '
write(6,11) 'Serial ', kk
call time_start
! Serial baseline: call the worker once per "thread" slice.
if ( kk==2 ) then
ds = 1.d-9
call search1 (1)
call search1 (2)
call search1 (3)
call search1 (4)
else
ds = 1.d-8
call searchX (1)
call searchx (2)
call searchx (3)
call searchx (4)
end if
call time_end
tserial = times
write(6,11) ' Serial ', kk, ' Elapsed time ', tserial
11 format (a,i2,a,2f9.5)
write(6,11) 'Parallel', kk
call time_start
! Parallel run: literal arguments (1..4) each have a distinct address,
! so every thread sees its own value.
if ( kk==2 ) then
h1 = start_thread@ (search1,1)
h2 = start_thread@ (search1,2)
h3 = start_thread@ (search1,3)
h4 = start_thread@ (search1,4)
else
h1 = start_thread@ (searchX,1)
h2 = start_thread@ (searchx,2)
h3 = start_thread@ (searchx,3)
h4 = start_thread@ (searchx,4)
end if
! Argument 0: wait for all outstanding threads to complete.
call wait_for_thread@ (0)
call time_end
tparallel = times
write(6,11) ' Parallel', kk, ' Elapsed time ', tparallel
write(6,12) ' x increase in speed ', tserial/tparallel
if ( kk==1 ) then
! Reduce the per-thread maxima collected by searchX.
j = maxloc( tmax(1:max_threads), 1 )
write (*,*) 'max val is',tmax(j),' at s=',smax(j)
end if
! Test of calling start_thread@ in a DO loop
! hh(j) = start_thread@ (search1,j) failed, as j is changing its value
! while arg(j) appears to transfer correct address
!
write(6,11) 'Test arg', kk
call time_start
! DO-loop launch: arg(j) gives each thread a stable, unique address,
! unlike the loop index j which keeps changing while threads start.
if ( kk==2 ) then
do j = 1,max_threads
arg(j) = j
hh(j) = start_thread@ (search1,arg(j))
end do
else
do j = 1,max_threads
arg(j) = j
hh(j) = start_thread@ (searchx,arg(j))
end do
end if
call wait_for_thread@ (0)
call time_end
tparallel = times
write(6,11) ' Test arg', kk, ' Elapsed time ', tparallel
write(6,12) ' x increase in speed ', tserial/tparallel
if ( kk==1 ) then
j = maxloc( tmax(1:max_threads), 1 )
write (*,*) 'max val is',tmax(j),' at s=',smax(j)
end if
12 format (a,2f9.5)
end do
end subroutine do_tests
end module data_mod
program main1
! Entry point: all of the work is in data_mod::do_tests.
use data_mod
write (*,*) '[Ken_thread2b.f90 ]'
call do_tests
end |
I ran it on my pc that supports 4 threads. |
|
Back to top |
|
|
PaulLaidler Site Admin
Joined: 21 Feb 2005 Posts: 7933 Location: Salford, UK
|
Posted: Mon Dec 11, 2017 4:35 pm Post subject: |
|
|
Ken's original code raises two further issues.
1) The code fragment
Code: | call lock@(ithread)
ds_loc = ds
call unlock@(ithread) |
appears in two places where the actual value of ithread is different. This is probably not what is intended. In working code (rather than this cut-down demo) the variable ds would be shared and needing read/write protection. In which case the locking ID would need to be the same in both instances.
2) The other issue is for us to sort out. The use of start_thread@ in a 64 bit environment raises the question: what makes a call "thread safe" for 64 bits? We need to look into this and (at least) provide some documentation that explains what kind of call is considered to be safe. |
|
Back to top |
|
|
John-Silver
Joined: 30 Jul 2013 Posts: 1520 Location: Aerospace Valley
|
Posted: Mon Dec 11, 2017 6:53 pm Post subject: |
|
|
how can x64 possibly be 60% slower than x32 ?
Browsing haphazardly through the on-line documentation for Threading I note that it all seems to be tied to .NET programming, why is that ?
I also note that threading for implementing multi-processing appears to be non-automatic, in that it needs a lot of knowledge to start with about portions of a code that could benefit from it, and then careful disaggregation of such code and 'enveloping' in the necessary threading code.
It seems, like most too-good-to-be-true mega-methods, laden with potential penelope pitfalls, not to mention dastardly Dan devils, until it evolves to be peter perfect.
(This is a global observation not a critique in any way of ftn95) |
|
Back to top |
|
|
PaulLaidler Site Admin
Joined: 21 Feb 2005 Posts: 7933 Location: Salford, UK
|
Posted: Mon Dec 11, 2017 8:49 pm Post subject: |
|
|
John-Silver
Some of the results in this Forum thread are erroneous. If correct results are 60% slower then that will be a particular case. In general my experience is that 64 bit results are usually on a par with 32 bit results (for speed).
The real motivation for moving to 64 bits is the increase in address space.
start_thread@ is for Win32 and x64 and not .NET.
Yes, multi-threading can be tricky. |
|
Back to top |
|
|
Kenneth_Smith
Joined: 18 May 2012 Posts: 697 Location: Hamilton, Lanarkshire, Scotland.
|
Posted: Mon Dec 11, 2017 9:53 pm Post subject: |
|
|
Paul, John,
Thanks to you both for comments and detective work.
I can see how to make this work now.
Ken |
|
Back to top |
|
|
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Tue Dec 12, 2017 1:27 am Post subject: |
|
|
Based on the examples I have posted, there does not appear to be significant performance problems with 64-bit multi-threading.
I have used two of Ken's examples, I called ;
Search1 : simple loop count, where 64-bit is faster
SearchX : loop of COMPLEX calculation, where 64-bit is slower.
I have not identified the problem, but I suspect it is the use of cmplx (x,y,kind=dp) or COMPLEX i_loc in the following:
i_loc = v1/cmplx((r1+r2/s_loc),(x1+x2),kind=dp)
t_loc = (k_loc)*((abs(i_loc)**2)*(r2/s_loc))
I rarely use complex type variables, so am not familiar with this type of problem.
My "Test arg" example posted above is an interesting example of multi-threading, as for my first try below with j as the arg value, this fails as the variable "j" changes through the loop and so changes in each of the initiated threads. This is because the same address of the variable j is transferred to each thread:
do j = 1,max_threads
hh(j) = start_thread@ (searchx,j)
end do
By changing to arg(j), a unique address for arg(j) is transferred to each thread and so it appears to work ok
do j = 1,max_threads
arg(j) = j
hh(j) = start_thread@ (searchx,arg(j))
end do
Further tests need to be done for varying max_threads to identify the multi-thread efficiency and overheads. My pc's support up to 4 or 8 threads. ( It has been a long time since I used a pc that supported only 2 threads. )
John |
|
Back to top |
|
|
John-Silver
Joined: 30 Jul 2013 Posts: 1520 Location: Aerospace Valley
|
Posted: Wed Dec 13, 2017 6:56 am Post subject: |
|
|
Paul wrote:
Quote: | If correct results are 60% slower then that will be a particular case |
I was referring to John C's results for Example Code 1 (both serial & parallel results are 60% longer) so they are not erroneous results.
John has since commented above. |
|
Back to top |
|
|
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Wed Dec 13, 2017 10:39 am Post subject: |
|
|
Thanks to Ken for providing an example of using multi-threading.
I have taken this and produced a program that tests multi-threading and demonstrates:
# how to vary the number of threads and call in a DO loop
# DO loop approach allows for variable number of threads.
# use the same thread-safe routine for multiple threads, using local private variables and shared variables in a module.
# how to transfer a unique argument to each thread call.
# how to share work between threads to improve performance
The attached example works for both 32-bit and 64-bit applications.
I hope it could be helpful for others to create useful solutions.
John
https://www.dropbox.com/s/5u4ojctkshhq87z/ken_thread_test4.zip?dl=0 |
|
Back to top |
|
|
PaulLaidler Site Admin
Joined: 21 Feb 2005 Posts: 7933 Location: Salford, UK
|
Posted: Wed Dec 13, 2017 2:57 pm Post subject: |
|
|
The failure of 64 bit CLOCK@ (and DCLOCK@) has now been fixed for the next release of clearwin64.dll. |
|
Back to top |
|
|
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Fri Apr 05, 2019 2:27 am Post subject: |
|
|
Ken,
Have you made any progress with the threading ?
It would be good to be able to use a DO loop to manage the number of available threads.
We could have a DO for the threads available or tasks to perform and then associate the thread number with each task.
I am having a bit of a problem managing the private do index for each thread.
Interested to hear how you proceeded.
John |
|
Back to top |
|
|
Kenneth_Smith
Joined: 18 May 2012 Posts: 697 Location: Hamilton, Lanarkshire, Scotland.
|
Posted: Fri Apr 05, 2019 10:22 am Post subject: |
|
|
John,
Afraid this got put on the back burner last year when I was involved in some work for an arbitration - which consumed all my time. Thereafter I decided it was time for a change in the direction of my career, so I gave up work at the end of 2018. Now have my own one man business up and running and I am presently focusing on new clients - with some success , but not yet found the time to come back to this, although I do have a long list of "what happens if" scenarios I need to test. I will get back to this - once the business clears the Director's Loan and pays me a dividend (July hopefully).
Ken |
|
Back to top |
|
|
JohnCampbell
Joined: 16 Feb 2006 Posts: 2556 Location: Sydney
|
Posted: Sun Aug 30, 2020 11:40 am Post subject: |
|
|
Paul, Ken and others,
I have posted a multi-thread example using !$OMP in http://forums.silverfrost.com/viewtopic.php?t=4297&start=15
I am wondering how well this may be reproduced in the FTN95 parallel processing approach.
The basic !$OMP PARALLEL DO loop approach is a minimal approach for doing parallel processing. The code example is: Code: | call omp_set_num_threads (4)
!
!$OMP PARALLEL DO &
!$OMP& SHARED ( block_array_records, max_blocks ) &
!$OMP& PRIVATE ( i ) &
!$OMP& SCHEDULE (DYNAMIC)
do i = 1, max_blocks
if ( block_array_records(i)%block_size <= 0 ) cycle
!
call process_block ( i, block_array_records(i)%block_size, block_array_records(i)%block )
!
end do ! i
!$OMP END PARALLEL DO |
In this approach, a DO loop is processed using multiple threads.
'i' : the DO loop index, is a special private variable, unique to each thread/process, so that i, the DO index, has a different memory address for each thread/process. I have struggled with this in my FTN95 testing.
'max_blocks' defines the loop count, so a variable defines the number of thread events to be processed, while 'call omp_set_num_threads (4)' defines the number of threads that process these events.
To package each event, the task is processed through 'call process_block'
An important distinction in OpenMP is between SHARED and PRIVATE variables. This could be managed in FTN95 and OpenMP via call process_block by having all shared variables/arrays as arguments to the routine, while all private variables/arrays are declared as local in the called routine (except for Private "i"). Can FTN95 allow a general routine like process_block, with flexibility in the arguments.
Returned values can be either in the shared arrays or as a shared accumulator. !$OMP& REDUCTION(+ : n_dot) accumulation could be emulated via an argument array n_dot(Max_blocks) to return values and sum after the end of the loop.
The argument "block_array_records(i)%block_size" provides a unique address to the routine, using "i".
While processing, we need to know both the loop counter 'i' and the thread id 'id = omp_get_thread_num ()'
The allocation of threads to each loop iteration is via loop "!$OMP& SCHEDULE (DYNAMIC)"; in this, the next iteration "i" is allocated to the next available thread, when they become available. SCHEDULE (STATIC) is an alternative where each loop iteration "i" has a pre-defined thread "id". These two alternative thread allocation cases would be necessary for load management between threads.
Are you able to comment on how some of these approaches are available or may be available in multi-core processors with 64 bit FTN95. |
|
Back to top |
|
|
PaulLaidler Site Admin
Joined: 21 Feb 2005 Posts: 7933 Location: Salford, UK
|
Posted: Sun Aug 30, 2020 2:08 pm Post subject: |
|
|
John
Sorry but my knowledge of this subject is very limited. |
|
Back to top |
|
|
Kenneth_Smith
Joined: 18 May 2012 Posts: 697 Location: Hamilton, Lanarkshire, Scotland.
|
Posted: Mon Aug 31, 2020 9:57 am Post subject: |
|
|
John, this is a variation on one of the examples for the parallel processing approach. Unlike Gfortran, with FTN95 you cannot simply define a section of code to be executed in parallel. So all serial code prior to the parallel section must be within the IF( .not. IsSlaveProcess@()) THEN ...... END IF block.
It took me ages to get this example to work this way, and then I went off to do something else and never came back to it.
Code: | program main
! Demonstration of FTN95 multi-process parallelism: np processes
! cooperatively sum the integers 1..10^10 in an interleaved fashion,
! each writing its partial sum into a named shared array.
implicit none
INCLUDE <windows.ins>
DOUBLE PRECISION start_time,end_time,sum
double precision duration, sum1
DOUBLE PRECISION,allocatable::partial_answer(:)
! NOTE(review): FTN95 kind numbering is assumed here, where kind=3 is
! 32-bit and kind=4 is 64-bit -- confirm; k must hold 10^10.
INTEGER(kind=4) ID
INTEGER(kind=4) k
integer(kind=4) :: np=4, i, j
!>> TEST TO FIND MAIN PROCESS. Note if IF/ENDIF is commented out, the subroutine is called NP times
IF( .not. IsSlaveProcess@()) THEN
call set_parameters(np)
ENDIF
!>> Start np-1 additional tasks. ID will be returned thus:
!>> Master task ID=0
!>> Slave task ID=1,2,3 in the different processes
ID=GetParallelTaskID@(np-1) !##
IF(ID .eq. 0) print*, 'Number of processors', np
!>> Allocate a shared array. SHARENAME="shared_stuff" names the memory
!>> block shared between the master and slave processes.
ALLOCATE(partial_answer(np),SHARENAME="shared_stuff")
CALL TaskSynchronise@()
!>> Time the task using wall clock elapsed time
CALL dclock@(start_time)
sum=0d0
!>> All np processes compute the sum in an interleaved fashion:
!>> process ID sums 10^10-ID, 10^10-ID-np, ... down to 1.
k = 10000000000_4 - ID
WHILE(k > 0)DO
sum = sum + k
k = k - np
ENDWHILE
!>> Copy the partial sum into the array shared between the processes
partial_answer(ID+1)=sum
CALL TaskSynchronise@()
CALL dclock@(end_time)
IF(ID==0)THEN
!>> We are the master task, so print out the results and the timing
sum1 = 0.d0
do i = 1, np
sum1 = sum1 + partial_answer(i)
end do
PRINT *,"Sum=",sum1
duration=end_time-start_time
PRINT *,"Parallel computation time = ",duration
ENDIF
CALL TaskSynchronise@()
!>> Kill off the slave process
IF(ID .ne. 0) STOP
DEALLOCATE(partial_answer)
END PROGRAM
! Prompt on stdout and read from stdin until a valid (positive)
! processor count is supplied.
!   np (out) : number of processes to use, guaranteed >= 1 on return.
! Fixes: the original ended with "end set_parameters", which is not
! valid Fortran (the "subroutine" keyword is required when a name is
! given); the labeled goto is replaced by a structured loop.
subroutine set_parameters(np)
implicit none
integer(kind=4), intent(out) :: np
do
  write(6,*)
  write(6,*) 'Enter number of processors to use'
  read(5,*) np
  if (np .ge. 1) exit
end do
end subroutine set_parameters |
|
|
Back to top |
|
|
|
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot vote in polls in this forum
|
Powered by phpBB © 2001, 2005 phpBB Group
|