Hello,
I would like to compare the execution times of two 32 bit applications which deal with operations on matrices A,B,C and only differ in the way the matrices are defined. In the first application named bench.exe the matrices are defined in a common block, in the second application named bench_mod.exe the matrices are defined using attribute 'allocatable' and allocated via call 'allocate'.
Both applications are generated from the same code base named bench.for using ftn95. They are compiled with the preprocessor flag USE_ALLOCATE set to value 1 or not set at all.
Now I compared several execution times produced during runtime and observed that some of the execution times of application bench_mod.exe were up to 35 % greater than the corresponding execution times of bench.exe.
Is there any experience with, or an estimate of, the additional execution time to be expected when arrays defined in common blocks are replaced by arrays declared with the attribute 'allocatable'?
The background is the following: when porting a 32 bit application to 64 bit I need to define arrays (of potentially big size) which are defined in common blocks for the 32 bit version and which are defined as arrays with attribute 'allocatable' and later allocated for the 64 bit version. I would like to use this technique for the 32 bit version, as well, but only if the execution time would not increase too much.
The code for both applications is
options (INTL)
C NOTE(review): INTL presumably selects 4-byte default integers in
C FTN95 - confirm against the compiler documentation.
C
C
C Compile this source with SAVE ZEROISE OPTIMIZE
C
C BENCHMARK
C 486 o 486 o 860
C cpu times 25 MHZ 33 MHz
C part 1 0.3
C part 2 71.4 56.0 16.0
C part 3 32.7
C part 4 82.8
C
C Suffix (o) marks runs with compiler option 'optimize'
C
C Tests from 03/2001
C
C 300 MHz 800 MHz ADLON
C bench
C
#ifdef USE_ALLOCATE
! bench_module
!
! Holds the benchmark matrices A, B and C as ALLOCATABLE module arrays
! together with their order LPX.  Only compiled when the preprocessor
! symbol USE_ALLOCATE is defined.
      module bench_module
      implicit none
      integer, parameter :: LPX = 2000
      real, allocatable :: A(:,:), B(:,:), C(:,:)
      contains
!
! init_arr
!
! Allocates A, B and C in this order, each as an LPX x LPX array.
! The first allocation that fails is reported with a message on the
! default output unit and the remaining arrays are not attempted.
!
!   ierr (out)  status of the last ALLOCATE that was executed,
!               0 when all three arrays were allocated
      subroutine init_arr(ierr)
      integer*4 ierr
      allocate(A(LPX,LPX), stat=ierr)
      if (ierr .eq. 0) then
        allocate(B(LPX,LPX), stat=ierr)
        if (ierr .eq. 0) then
          allocate(C(LPX,LPX), stat=ierr)
          if (ierr .ne. 0) then
            write(*,*) 'Fehler beim Allokieren von Array C(:,:)'
          end if
        else
          write(*,*) 'Fehler beim Allokieren von Array B(:,:)'
        end if
      else
        write(*,*) 'Fehler beim Allokieren von Array A(:,:)'
      end if
      end subroutine init_arr
      end module bench_module
#endif
! BENCH
!
! Timing benchmark on three LPX x LPX default-REAL matrices A, B, C.
! With USE_ALLOCATE defined the matrices are the ALLOCATABLE arrays of
! module bench_module, otherwise they are members of COMMON /COM001/.
! After each stage the difference between two successive CPUTIM
! readings is written to unit KT.  The stages are, in order:
!   - fill A and B (each element = its column index, diagonal doubled)
!   - a scalar multiply-and-compare loop
!   - in-place inversion of B by POSINV
!   - C = A*B, with largest and mean deviation from the unit matrix
!   - a multiply/divide loop with data dependent branches
!
! NOTE(review): unit KT=2 is never opened here; presumably FTN95
! preconnects it to the screen - confirm before porting.
! The commented-out statements write the same figures to a protocol
! file on unit ka; they are kept so the log can be switched on again.
      program bench
#ifdef USE_ALLOCATE
      use bench_module
#else
      parameter ( LPX = 2000 )
      real A(LPX,LPX),B(LPX,LPX),C(LPX,LPX)
      common /COM001/ A,B,C
#endif
      real*8 wert,sumf,fmax
      common /COMIO/ KT,KL
#ifdef USE_ALLOCATE
      write(*,*) 'A,B,C allocated? '                                        &
     &, allocated(A), allocated(B), allocated(C)
      call init_arr(ierr)
! init_arr has already printed which array could not be allocated;
! running on would reference unallocated matrices in the first loop.
      if (ierr .ne. 0) stop 'BENCH: matrix allocation failed'
      write(*,*) 'A,B,C allocated? '                                        &
     &, allocated(A), allocated(B), allocated(C)
#endif
      ka = 6
      kt = 2
      kl = 1
      t1 = 0.0
!     open(ka,file='BENCH.PRO')
      call cputim(t1)
      t0 = t1
      lpm = LPX
!
! --- fill A and B
      do j1 = 1, lpm
        do j2 = 1, lpm
          b(j1,j2) = float(j2)
          a(j1,j2) = float(j2)
        end do
        b(j1,j1) = 2.0*b(j1,j1)
        a(j1,j1) = 2.0*a(j1,j1)
      end do
      call cputim(t1)
      write(kt,1001) t1-t0
!     write(ka,1001) t1-t0
      t0 = t1
!
! --- scalar arithmetic: 10 x 30000 products compared against -1.
      aw = 1.1
      bw = 2.2
      cw = 3.3
      dw = 4.4
      ew = 5.5
      spin: do j2 = 1, 10
        do j1 = 1, 30000
          if (aw*bw*cw*dw*ew*j1 .eq. -1.) exit spin
        end do
      end do spin
      call cputim(t1)
      write(kt,1001) t1-t0
!     write(ka,1001) t1-t0
      t0 = t1
      write(kt,1010)
!     write(ka,1010)
!
! --- invert B in place
      call posinv(b,lpm)
      call cputim(t1)
      write(kt,1001) t1-t0
!     write(ka,1001) t1-t0
      t0 = t1
      write(kt,1011)
!     write(ka,1011)
!
! --- C = A*B; B now holds the inverse of the original B (= A), so
!     the product is compared with the unit matrix
      fmax = 0.0
      sumf = 0.0
      do j1 = 1, lpm
        do j2 = 1, lpm
          wert = 0.0
          do l2 = 1, lpm
            wert = wert+a(j1,l2)*b(l2,j2)
          end do
          c(j1,j2) = wert
          if (j1 .eq. j2) wert = wert-1d0
          wert = dabs(wert)
          sumf = sumf+wert
          if (wert .gt. fmax) fmax = wert
        end do
      end do
      sumf = sumf/lpm/lpm
      write(kt,1020) fmax,sumf
!     write(ka,1020) fmax,sumf
      call cputim(t1)
      write(kt,1001) t1-t0
!     write(ka,1001) t1-t0
      t0 = t1
!
! --- multiplication that cannot be vectorised: the sign flag i
!     alternates on every inner iteration and selects, together with
!     the position relative to j1, a multiply or a guarded divide
      fmax = 0.0
      sumf = 0.0
      i = +1
      do j1 = 1, lpm
        do j2 = 1, lpm
          wert = 0.0
          do l2 = 1, lpm
            i = -i
            if (l2 .gt. j1) then
              if (i .gt. 0) then
                wert = wert-a(l2,j1)*b(l2,j2)
              else
                if (b(l2,j2).gt..00001) wert=wert-a(l2,j1)/b(l2,j2)
              end if
            else
              if (i .lt. 0) then
                wert = wert+a(j1,l2)*b(l2,j2)
              else
                if (b(l2,j2).gt..00001) wert=wert+a(j1,l2)/b(l2,j2)
              end if
            end if
          end do
          c(j1,j2) = wert
          sumf = sumf+dabs(wert)
        end do
      end do
      write(kt,1030) sumf
!     write(ka,1030) sumf
      call cputim(t1)
      write(kt,1001) t1-t0
!     write(ka,1001) t1-t0
      t0 = t1
!     close(ka)
      stop
!
! --- output formats (1030: the lost umlaut is spelled out in ASCII)
 1001 format(' cpu-Zeit:',F10.3)
 1010 format(' MATRIX AUFBEREITET !')
 1011 format(' MATRIX INVERTIERT !')
 1020 format(' GROESSTER EINZELFEHLER:',E12.5,/,                            &
     &' MITTLERER FEHLER :',E12.5,/)
 1030 format(' Testwert fuer Teil4:',E15.5)
      end
! POSINV
!
! Inverts the NMAX x NMAX matrix A in place, one diagonal pivot after
! the other.  Pivots are always taken from the diagonal as they come
! (no row or column exchange), so each of them has to be non-zero.
!
!   A    (in/out)  matrix on entry, its inverse on return
!   NMAX (in)      order of A
      subroutine posinv(a, nmax)
      implicit none
      integer nmax
      real a(nmax,nmax)
      integer ipiv, icol, irow
      real rpiv, fac
!
      do ipiv = 1, nmax
        rpiv = 1./a(ipiv,ipiv)
!       pivot column times -1/pivot (the pivot itself becomes -1)
        do irow = 1, nmax
          a(irow,ipiv) = -a(irow,ipiv)*rpiv
        end do
!
        do icol = 1, nmax
          fac = a(ipiv,icol)
!         a zero in the pivot row leaves the whole column untouched
          if (fac .ne. .0) then
            if (icol .ne. ipiv) then
              do irow = 1, nmax
                if (irow .ne. ipiv) then
                  if (a(irow,ipiv) .ne. .0) then
                    a(irow,icol) = a(irow,icol) + fac*a(irow,ipiv)
                  end if
                end if
              end do
            end if
            a(ipiv,icol) = fac*rpiv
          end if
        end do
!
!       the pivot position finally receives the reciprocal pivot
        a(ipiv,ipiv) = rpiv
      end do
!
      return
      end
C
! CPUTIM
!
! Timer used by the benchmark: returns the clock time in seconds in T,
! exactly as delivered by the system routine CLOCK@.  That system
! routine may have to be replaced on another compiler or platform.
      subroutine cputim (t)
      real t
      call clock@(t)
      end
It is compiled via commands
ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /DEFINE USE_ALLOCATE 1 /-windows /LINk
copy bench.exe bench_mod.exe
and
ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /-windows /LINk
Any comments are appreciated.
Regards, Dietmar