Silverfrost Forums

Welcome to our forums

Arrays in common blocks versus allocatable arrays

10 Jul 2014 10:52 #14321

Hello,

I would like to compare the execution times of two 32 bit applications which deal with operations on matrices A,B,C and only differ in the way the matrices are defined. In the first application named bench.exe the matrices are defined in a common block, in the second application named bench_mod.exe the matrices are defined using attribute 'allocatable' and allocated via call 'allocate'.

Both applications are generated from the same code base named bench.for using ftn95. They are compiled with the preprocessor flag USE_ALLOCATE set to value 1 or not set at all.

Now I compared several execution times produced during runtime and observed that some of the execution times of application bench_mod.exe were up to 35 % greater than the corresponding execution times of bench.exe.

Is there any experience or estimation about additional execution times when moving arrays defined on common blocks to arrays defined via attribute 'allocatable'?

The background is the following: when porting a 32 bit application to 64 bit I need to define arrays (of potentally big size) which are defined in common blocks for the 32 bit version and which are defined as arrays with attribute 'allocatable' and later allocated for the 64 bit version. I would like to use this technique for the 32 bit verison, as well, but only if the execution time would not increase too much.

The code for both applications is

      options (INTL)
C
C
C     Quelle mit SAVE ZEROISE OPTIMIZE  compilieren
C
C     BENCHMARK
C                  486 o    486 o   860
C     cpu-Zeiten  25 MHZ   33 MHz
C     Teil 1       0.3
C     Teil 2      71.4     56.0    16.0 
C     Teil 3      32.7     
C     Teil 4      82.8
C
C     Zusatz (o) fr Compileroption 'optimize'
C
C     Tests von 03/2001
C
C              300 MHz   800 MHz   ADLON
C     bench    
C

#ifdef USE_ALLOCATE

      module bench_module

      PARAMETER ( LPX = 2000 )
            
      REAL
     *, allocatable ::
     *  A(:,:)
     *, B(:,:)
     *, C(:,:)
      
      contains 
       subroutine init_arr(ierr)
        integer*4 ierr
        allocate(A(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array A(:,:)'
      	  return
        endif
        allocate(B(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array B(:,:)'
      	  return
        endif
        allocate(C(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array C(:,:)'
      	  return
        endif
        return        
       end subroutine init_arr     
      end module bench_module
      
#endif

      PROGRAM BENCH

#ifdef USE_ALLOCATE
      use bench_module
#else
      PARAMETER ( LPX = 2000 )
      REAL A(LPX,LPX),B(LPX,LPX),C(LPX,LPX)
      COMMON /COM001/ A,B,C
#endif
      REAL*8 WERT,SUMF,FMAX
      COMMON /COMIO/ KT,KL
#ifdef USE_ALLOCATE
      write(*,*) 'A,B,C allocated? '
     *           , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C)
      call init_arr(ierr)
      write(*,*) 'A,B,C allocated? '
     *           , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C)
#endif      
      ka=6
      KT=2
      KL=1
      T1=0.0
C      open(ka,file='BENCH.PRO')
      CALL CPUTIM(T1)
      T0=T1
      LPM=LPX
      DO 10 J1 = 1,LPM
      DO 12 J2 = 1,LPM
      B(J1,J2)=FLOAT(J2)
   12 A(J1,J2)=FLOAT(J2)
      B(J1,J1)=2.0*B(J1,J1)
   10 A(J1,J1)=2.0*A(J1,J1)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1	    
      aw=1.1
      bw=2.2
      cw=3.3
      dw=4.4
      ew=5.5
      do 16 j2=1,10
      do 14 j1=1,30000
14    if(aw*bw*cw*dw*ew*j1.eq.-1.) goto 18
16    continue

18    CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
      WRITE(KT,1010)
C      WRITE(ka,1010)
 1010 FORMAT(' MATRIX AUFBEREITET !')
      CALL POSINV(B,LPM)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
      WRITE(KT,1011)
C      WRITE(ka,1011)
 1011 FORMAT(' MATRIX INVERTIERT !')
      FMAX=0.0
      SUMF=0.0
      DO 20 J1 = 1,LPM
      DO 20 J2 = 1,LPM
      WERT=0.0
      DO 24 L2 = 1,LPM
   24 WERT=WERT+A(J1,L2)*B(L2,J2)
      C(J1,J2)=WERT
      IF(J1.EQ.J2) WERT=WERT-1D0
      WERT=DABS(WERT)
      SUMF=SUMF+WERT
      IF(WERT.GT.FMAX) FMAX=WERT
   20 CONTINUE
      SUMF=SUMF/LPM/LPM
      WRITE(KT,1020) FMAX,SUMF
C      WRITE(ka,1020) FMAX,SUMF
 1020 FORMAT(' GROESSTER EINZELFEHLER:',E12.5,/,
     *' MITTLERER FEHLER      :',E12.5,/)
 1001 FORMAT(' cpu-Zeit:',F10.3)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C
C     Multiplikation ohne Vektorm”glichkeit
      FMAX=0.0
      SUMF=0.0
      I=+1
      DO 30 J1 = 1,LPM
      DO 30 J2 = 1,LPM
      WERT=0.0
      DO 34 L2 = 1,LPM
      I=-I
      IF(L2.GT.J1) THEN
       IF(I.GT.0) THEN
        WERT=WERT-A(L2,J1)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT-A(L2,J1)/B(L2,J2)
       ENDIF
      ELSE
       IF(I.LT.0) THEN
        WERT=WERT+A(J1,L2)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT+A(J1,L2)/B(L2,J2)
       ENDIF
      ENDIF
   34 CONTINUE
      C(J1,J2)=WERT
      SUMF=SUMF+DABS(WERT)
   30 CONTINUE
      WRITE(KT,1030) SUMF
C      WRITE(ka,1030) SUMF
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C      close(ka)
      stop
 1030 FORMAT(' Testwert fr Teil4:',E15.5)
      END
C POSINV
      SUBROUTINE POSINV(A,NMAX)
      LOGICAL LOGWRI
      DIMENSION A(NMAX,NMAX)
C
      DO 30 N=1,NMAX
      D=1./A(N,N)
      DO 10 J=1,NMAX
      A(J,N)=-A(J,N)*D
   10 CONTINUE
C
      DO 25 I=1,NMAX
      A1=A(N,I)
      IF(A1.EQ..0)GOTO 25
      IF(N.EQ.I) GOTO 20
      DO 15 J=1,NMAX
      IF(N.EQ.J) GOTO 15
C      A2=A(J,N)
C      IF(A2.EQ..0) GOTO 15
C      A(J,I)=A(J,I)+A1*A2
       if(a(j,n).ne..0) A(J,I)=A(J,I)+A1*A(J,N)
   15 CONTINUE
   20 A(N,I)=A1*D
   25 CONTINUE
C
      A(N,N)=D
C
   30 CONTINUE
C
      RETURN
      END
C
C CPUTIM
      SUBROUTINE CPUTIM (T)
C
C     Gib Clock-Zeit in sec
      CALL CLOCK@(T)                                                    /* Systemroutine ist ggfs. zu ersetzen
      RETURN
      END

It is compiled via commands

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /DEFINE USE_ALLOCATE 1 /-windows /LINk copy bench.exe bench_mod.exe

and

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /-windows /LINk

Any comments are appreciated.

Regards, Dietmar

10 Jul 2014 11:00 #14322

Sorry, for some reason my latest post has not been posted completely. Here the code again:

      options (INTL)
C
C
C     Quelle mit SAVE ZEROISE OPTIMIZE  compilieren
C
C     BENCHMARK
C                  486 o    486 o   860
C     cpu-Zeiten  25 MHZ   33 MHz
C     Teil 1       0.3
C     Teil 2      71.4     56.0    16.0 
C     Teil 3      32.7     
C     Teil 4      82.8
C
C     Zusatz (o) fr Compileroption 'optimize'
C
C     Tests von 03/2001
C
C              300 MHz   800 MHz   ADLON
C     bench    
C

#ifdef USE_ALLOCATE

      module bench_module

      PARAMETER ( LPX = 2000 )
            
      REAL
     *, allocatable ::
     *  A(:,:)
     *, B(:,:)
     *, C(:,:)
      
      contains 
       subroutine init_arr(ierr)
        integer*4 ierr
        allocate(A(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array A(:,:)'
      	  return
        endif
        allocate(B(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array B(:,:)'
      	  return
        endif
        allocate(C(LPX,LPX),stat=ierr)
        if (ierr .ne.0) then
      	  write(*,*) 'Fehler beim Allokieren von Array C(:,:)'
      	  return
        endif
        return        
       end subroutine init_arr     
      end module bench_module
      
#endif

      PROGRAM BENCH

#ifdef USE_ALLOCATE
      use bench_module
#else
      PARAMETER ( LPX = 2000 )
      REAL A(LPX,LPX),B(LPX,LPX),C(LPX,LPX)
      COMMON /COM001/ A,B,C
#endif
      REAL*8 WERT,SUMF,FMAX
      COMMON /COMIO/ KT,KL
#ifdef USE_ALLOCATE
      write(*,*) 'A,B,C allocated? '
     *           , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C)
      call init_arr(ierr)
      write(*,*) 'A,B,C allocated? '
     *           , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C)
#endif      
      ka=6
      KT=2
      KL=1
      T1=0.0
C      open(ka,file='BENCH.PRO')
      CALL CPUTIM(T1)
      T0=T1
      LPM=LPX
      DO 10 J1 = 1,LPM
      DO 12 J2 = 1,LPM
      B(J1,J2)=FLOAT(J2)
   12 A(J1,J2)=FLOAT(J2)
      B(J1,J1)=2.0*B(J1,J1)
   10 A(J1,J1)=2.0*A(J1,J1)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1	    
      aw=1.1
      bw=2.2
      cw=3.3
      dw=4.4
      ew=5.5
      do 16 j2=1,10
      do 14 j1=1,30000
14    if(aw*bw*cw*dw*ew*j1.eq.-1.) goto 18
16    continue

18    CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
      WRITE(KT,1010)
C      WRITE(ka,1010)
 1010 FORMAT(' MATRIX AUFBEREITET !')
      CALL POSINV(B,LPM)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
      WRITE(KT,1011)
C      WRITE(ka,1011)
 1011 FORMAT(' MATRIX INVERTIERT !')
      FMAX=0.0
      SUMF=0.0
      DO 20 J1 = 1,LPM
      DO 20 J2 = 1,LPM
      WERT=0.0
      DO 24 L2 = 1,LPM
   24 WERT=WERT+A(J1,L2)*B(L2,J2)
      C(J1,J2)=WERT
      IF(J1.EQ.J2) WERT=WERT-1D0
      WERT=DABS(WERT)
      SUMF=SUMF+WERT
      IF(WERT.GT.FMAX) FMAX=WERT
   20 CONTINUE
      SUMF=SUMF/LPM/LPM
      WRITE(KT,1020) FMAX,SUMF
C      WRITE(ka,1020) FMAX,SUMF
 1020 FORMAT(' GROESSTER EINZELFEHLER:',E12.5,/,
     *' MITTLERER FEHLER      :',E12.5,/)
 1001 FORMAT(' cpu-Zeit:',F10.3)
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C
C     Multiplikation ohne Vektorm”glichkeit
      FMAX=0.0
      SUMF=0.0
      I=+1
      DO 30 J1 = 1,LPM
      DO 30 J2 = 1,LPM
      WERT=0.0
      DO 34 L2 = 1,LPM
      I=-I
      IF(L2.GT.J1) THEN
       IF(I.GT.0) THEN
        WERT=WERT-A(L2,J1)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT-A(L2,J1)/B(L2,J2)
       ENDIF
      ELSE
       IF(I.LT.0) THEN
        WERT=WERT+A(J1,L2)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT+A(J1,L2)/B(L2,J2)
       ENDIF
      ENDIF
   34 CONTINUE
      C(J1,J2)=WERT
      SUMF=SUMF+DABS(WERT)
   30 CONTINUE
      WRITE(KT,1030) SUMF
C      WRITE(ka,1030) SUMF
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C      close(ka)
      stop
 1030 FORMAT(' Testwert fr Teil4:',E15.5)
      END
C POSINV
      SUBROUTINE POSINV(A,NMAX)
      LOGICAL LOGWRI
      DIMENSION A(NMAX,NMAX)
C
      DO 30 N=1,NMAX
      D=1./A(N,N)
      DO 10 J=1,NMAX
      A(J,N)=-A(J,N)*D
   10 CONTINUE
C
      DO 25 I=1,NMAX
      A1=A(N,I)
      IF(A1.EQ..0)GOTO 25
      IF(N.EQ.I) GOTO 20
      DO 15 J=1,NMAX
      IF(N.EQ.J) GOTO 15
C      A2=A(J,N)
C      IF(A2.EQ..0) GOTO 15
C      A(J,I)=A(J,I)+A1*A2
       if(a(j,n).ne..0) A(J,I)=A(J,I)+A1*A(J,N)
   15 CONTINUE
   20 A(N,I)=A1*D
   25 CONTINUE
C
      A(N,N)=D
C
   30 CONTINUE
C
      RETURN
      END
C
C CPUTIM
      SUBROUTINE CPUTIM (T)
C
C     Gib Clock-Zeit in sec
      CALL CLOCK@(T)                                                    /* Systemroutine ist ggfs. zu ersetzen
      RETURN
      END

It is compiled via commands

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /DEFINE USE_ALLOCATE 1 /-windows /LINk copy bench.exe bench_mod.exe

and

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /-windows /LINk

Any comments are appreciated.

Regards, Dietmar

10 Jul 2014 11:12 #14323

Sorry once again, although the Preview looked quite well and as expected in the second post I made, the code has been cut once again. Hence I omit the code and continue after the code with my original post.

The code is compiled via commands

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /DEFINE USE_ALLOCATE 1 /-windows /LINk copy bench.exe bench_mod.exe

and

ftn95 bench.for /OLD_ARRAYS /ALT_KINDS /ZEROISE /SAVE /optimize /CFPP /-windows /LINk

Any comments are appreciated.

Regards, Dietmar

10 Jul 2014 2:52 #14327

I don't know why this happens. One would not expect the access time to be different.

I would look at the command line options /SAVE and /ZEROIZE. These may change the results. For example, /ZEROIZE may have no effect on arrays allocated using ALLOCATE. So if your computation depends on automatic zero initial values, this may change the way the calculation proceeds.

11 Jul 2014 8:09 #14329

I removed options /SAVE and /ZEROIZE, but this did not help very much: the differences in the execution times were nearly the same.

What is interesting to my opinion is the following: in both executables I compute the execution times for 3 matrix operations. The ratio beween the CPU times of bench_mod.exe and bench.exe for the three operations are approximately

1.001 (1st operation) 1.005 (2nd operation) 1.400 (3rd operation)

and it is the third ratio which makes me wonder.

Regards, Dietmar

11 Jul 2014 8:43 #14330

Paul,

one more observation. In the third matrix opertion/calculation (using matrices A(i,j),B(i,j)) we use the indices i and j for computaion in different sequences than in the second matrix operation/calculation. We wonder if that might have some influence on the differences in cpu time.

I do not know if you are interested in the programme code. If so, then please tell me. I was not able to submit the code in the previous posts although the preview looked ok, hence I would ask you, if I should try via email (provided you are interested).

Thanks, Dietmar

11 Jul 2014 11:41 #14331

Dietmar,

Like you I am changing my code extensively to move from common to Modules with allocated arrays. The question about run time is not significant for me, as the change is mandatory. Some of the run time difference you observe could be that the allocation and initialising of memory with a fixed common occurs before the timing commences, although this should not be 0.3 seconds. I sometimes use an AUDIT program that runs outside the executable to time elapsed time of the program run. This is based on DATE_AND_TIME call and storing VALUES to file. Don't give up on Module/Allocatable arrays, as this structure offers much flexibility for coding. If you could post the 3 examples as two separate codes I would be interested in looking at the differences, say on dropbox or similar. ( You are using elapsed time with SYSTEM_CLOCK intrinsic and FTN95 ? ) There can be a number of reasons why FTN95 can run slowly, although I think you use the same computation code for all tests ? ( I should look at the code you posted, although the layout is lost. )

John

11 Jul 2014 4:10 #14332

John,

we use clock@ for time evaluation. I am trying to post the rest of the code for the moment.

Thanks, Dietmar

      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C
C     Multiplikation ohne Vektorm”glichkeit
      FMAX=0.0
      SUMF=0.0
      I=+1
      DO 30 J1 = 1,LPM
      DO 30 J2 = 1,LPM
      WERT=0.0
      DO 34 L2 = 1,LPM
      I=-I
      IF(L2.GT.J1) THEN
       IF(I.GT.0) THEN
        WERT=WERT-A(L2,J1)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT-A(L2,J1)/B(L2,J2)
       ENDIF
      ELSE
       IF(I.LT.0) THEN
        WERT=WERT+A(J1,L2)*B(L2,J2)
       ELSE
        IF(B(L2,J2).GT..00001) WERT=WERT+A(J1,L2)/B(L2,J2)
       ENDIF
      ENDIF
   34 CONTINUE
      C(J1,J2)=WERT
      SUMF=SUMF+DABS(WERT)
   30 CONTINUE
      WRITE(KT,1030) SUMF
C      WRITE(ka,1030) SUMF
      CALL CPUTIM(T1)
      WRITE(KT,1001) T1-T0
C      WRITE(ka,1001) T1-T0
      T0=T1
C      close(ka)
      stop
 1001 FORMAT(' cpu-Zeit:',F10.3)
 1030 FORMAT(' Testwert fr Teil4:',E15.5)
      END
C POSINV
      SUBROUTINE POSINV(A,NMAX)
      LOGICAL LOGWRI
      DIMENSION A(NMAX,NMAX)
C
      DO 30 N=1,NMAX
      D=1./A(N,N)
      DO 10 J=1,NMAX
      A(J,N)=-A(J,N)*D
   10 CONTINUE
C
      DO 25 I=1,NMAX
      A1=A(N,I)
      IF(A1.EQ..0)GOTO 25
      IF(N.EQ.I) GOTO 20
      DO 15 J=1,NMAX
      IF(N.EQ.J) GOTO 15
C      A2=A(J,N)
C      IF(A2.EQ..0) GOTO 15
C      A(J,I)=A(J,I)+A1*A2
       if(a(j,n).ne..0) A(J,I)=A(J,I)+A1*A(J,N)
   15 CONTINUE
   20 A(N,I)=A1*D
   25 CONTINUE
C
      A(N,N)=D
C
   30 CONTINUE
C
      RETURN
      END
C
C CPUTIM
      SUBROUTINE CPUTIM (T)
C
C     Gib Clock-Zeit in sec
      CALL CLOCK@(T)                                                    /* Systemroutine ist ggfs. zu ersetzen
      RETURN
      END
12 Jul 2014 7:07 #14333

Dietmar,

I managed to reproduce your results and got poor performance in the last test with ALLOCATE. However I investigated some more and combined the test into 1 program for both types of arrays and the problem disappeared ?? I think the problem related to calculation of array addresses for this particular test case.

  • I also noticed that /opt did not show much improvement ??
  • SYSTEM_CLOCK is preferred to clock@
  • I retained fixed format, which Eddie will appreciate.

Anyway, I made a few changes to your code to test in a different way, some of which might be of interest.

I shall post the modified code below: module bench_module

      integer*4, PARAMETER :: LPX = 2000
     
      REAL 
     *, allocatable :: 
     *  A(:,:) 
     *, B(:,:) 
     *, C(:,:) 
     
      contains 
      
      subroutine init_arr (ierr) 
      integer*4 ierr
      
      allocate (A(LPX,LPX),stat=ierr) 
      if (ierr .ne.0) then 
        write(*,*) 'Fehler beim Allokieren von Array A(:,:)' 
        return 
      end if
      
      allocate (B(LPX,LPX),stat=ierr) 
      if (ierr .ne.0) then 
        write(*,*) 'Fehler beim Allokieren von Array B(:,:)' 
        return 
      end if
      
      allocate (C(LPX,LPX),stat=ierr) 
      if (ierr .ne.0) then 
        write(*,*) 'Fehler beim Allokieren von Array C(:,:)' 
        return 
      end if 
      return 
      end subroutine init_arr
      
      end module bench_module 
     
!      #endif 

      PROGRAM BENCH_TEST

      use bench_module 
      
      REAL Ac(LPX,LPX),Bc(LPX,LPX),Cc(LPX,LPX) 
      COMMON /COM001/ Ac,Bc,Cc 
      integer*4 ierr

      CALL CPUTIM ('first call') 

      write(*,*) 'A,B,C allocated? ' 
     * , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C) 
      call init_arr(ierr) 
      write(*,*) 'A,B,C allocated? ' 
     * , ALLOCATED(A), ALLOCATED(B), ALLOCATED(C) 
      CALL CPUTIM ('Test 0 : Allocate arrays' )

      call bench ( a, b, c, LPX, 'ALLOCATABLE ARRAYS' )
 
      call bench ( ac, bc, cc, LPX, 'COMMON ARRAYS' )

      call bench ( a, b, c, LPX, 'ALLOCATABLE ARRAYS' )

      end

      subroutine bench ( a, b, c, LPX, description )
      
      integer*4 LPX
      REAL      A(LPX,LPX),B(LPX,LPX),C(LPX,LPX)
      character description*(*)

      REAL*8 WERT,SUMF,FMAX
      COMMON /COMIO/ KT,KL 
       integer*4 KT,KL
!
      integer*4 lpm,j1,j2,i,l2
      integer*8 n
      real*4    aw,bw,cw,dw,ew
!
      write (*,99) 'LOOP TEST USING ',description
   99 format (/80('+')/ a,a/ 80('+'))
!   
      CALL CPUTIM ('enter test') 
             
!      ka=6 
      KT=2 
      KL=1 
C open(ka,file='BENCH.PRO') 
!
      LPM=LPX 
      DO 10 J1 = 1,LPM 
        DO 12 J2 = 1,LPM 
          B(J1,J2)=FLOAT(J2) 
   12     A(J1,J2)=FLOAT(J2) 
        B(J1,J1)=2.0*B(J1,J1) 
   10   A(J1,J1)=2.0*A(J1,J1) 
      CALL CPUTIM ('Test 0 : Initialise arrays' )
!
!
      aw=1.1 
      bw=2.2 
      cw=3.3 
      dw=4.4 
      ew=5.5 
      n = 0
      do 16 j2=1,10 
        do 14 j1=1,30000 
          n = n+1
   14     if(aw*bw*cw*dw*ew*j1 == -1.) goto 18 
   16 continue 
!
   18 CALL CPUTIM ('Test 1 : DO loop test')
      write (*,*) n
!      
!
      WRITE (KT,1010) ' MATRIX AUFBEREITET !'
      CALL POSINV (B,LPM) 
      CALL CPUTIM ('Test 2 : POSINV')

! cut 1
 
12 Jul 2014 7:18 #14334

Hopefully the rest of the test ! cut 1

      WRITE(KT,1010) ' MATRIX INVERTIERT !'
      FMAX=0.0 
      SUMF=0.0 
      DO 20 J1 = 1,LPM 
        DO 20 J2 = 1,LPM 
          WERT=0.0 
          DO 24 L2 = 1,LPM 
   24       WERT = WERT + A(J1,L2)*B(L2,J2) 
          C(J1,J2) = WERT 
          IF (J1.EQ.J2) WERT=WERT-1D0 
          WERT=DABS(WERT) 
          SUMF=SUMF+WERT 
          IF(WERT.GT.FMAX) FMAX=WERT 
   20 CONTINUE 
      SUMF=SUMF/LPM/LPM 

      WRITE(KT,1020) FMAX,SUMF 
 1020 FORMAT(' GROESSTER EINZELFEHLER:',E12.5,/, 
     * ' MITTLERER FEHLER :',E12.5,/) 
      CALL CPUTIM ('Test 3 : MATRIX INVERTIERT') 
!      
C 
C     Multiplikation ohne Vektormglichkeit 
       FMAX=0.0 
       SUMF=0.0 
       I=+1 
       n = 0
       DO 30 J1 = 1,LPM 
         DO 30 J2 = 1,LPM 
           WERT=0.0 
           DO 34 L2 = 1,LPM 
             I=-I 
             IF(L2.GT.J1) THEN 
              IF(I.GT.0) THEN 
               WERT=WERT - A(L2,J1)*B(L2,J2)
               n = n + 1
              ELSE IF(B(L2,J2) > .00001) THEN
               WERT=WERT - A(L2,J1)/B(L2,J2) 
               n = n + 1
              ENDIF 
             ELSE 
              IF(I.LT.0) THEN 
               WERT=WERT + A(J1,L2)*B(L2,J2) 
               n = n + 1
              ELSE IF(B(L2,J2) > .00001) then
               WERT=WERT + A(J1,L2)/B(L2,J2) 
               n = n + 1
              ENDIF 
             ENDIF 
   34      CONTINUE 
           C(J1,J2)=WERT 
           SUMF=SUMF+DABS(WERT) 
   30 CONTINUE 
      WRITE(KT,1030) SUMF, n
      CALL CPUTIM ('Test 4 : Multiplikation ohne Vektormglichkeit') 
!
C     close(ka) 
      write (*,*) 'end  of test'
!      stop 
! 1001 FORMAT(' cpu-Zeit:',F10.3,a) 
 1010 FORMAT(/a) 
 1030 FORMAT(' Testwert fr Teil4:',E15.5, 2x,i0) 
      END 

C POSINV 
      SUBROUTINE POSINV(A,NMAX)
      INTEGER*4 nmax 
      REAL*4 A(NMAX,NMAX) 

!      LOGICAL LOGWRI 
      integer*4 N,J,I
      real*4    D,A1
C 
      DO 30 N=1,NMAX 
        D=1./A(N,N) 
        DO 10 J=1,NMAX 
          A(J,N)=-A(J,N)*D 
   10   CONTINUE 
C 
        DO 25 I=1,NMAX 
          A1=A(N,I) 
          IF(A1 == 0.0)GOTO 25 
          IF(N.EQ.I) GOTO 20 
          DO 15 J=1,NMAX 
            IF(N.EQ.J) GOTO 15 
C           A2=A(J,N) 
C           IF(A2.EQ..0) GOTO 15 
C           A(J,I)=A(J,I)+A1*A2 
            if(a(j,n) /= 0.0) A(J,I)=A(J,I)+A1*A(J,N) 
   15     CONTINUE 
   20     A(N,I)=A1*D 
   25   CONTINUE 
C 
        A(N,N)=D 
C 
   30 CONTINUE 
C 
      RETURN 
      END 
C 
C CPUTIM 
      SUBROUTINE CPUTIM (comment)
      character comment*(*)
!
      integer*4 COUNT, COUNT_RATE, COUNT_MAX
      REAL*4 :: t 
      real*4 :: lt  = -1.0
C 
C     Gib Clock-Zeit in sec 
      call SYSTEM_CLOCK (COUNT, COUNT_RATE, COUNT_MAX)
      T = real(count) / real(count_rate)
!      CALL CLOCK@ (T)           ! Systemroutine ist ggfs. zu ersetzen 
      if ( lt >= 0) write (*,1001) t-lt,comment
      lt = t
 1001 FORMAT(' cpu-Zeit:',F10.3,2x,a) 
      RETURN 
      END 

[/code]

12 Jul 2014 9:16 #14335

I investigated some more with FTN95 Ver 7.00 and found: using /opt test 4 actually runs slower using /opt does not improve these tests using real*8 test 4 with allocate runs about the same time Placing both allocation and common in the same test program, as I posted, the slowness of test 4 with allocate disappears, as the memory model info is lost. This is also the case with POSINV.

I think the answer for test 4 is that for local allocatable arrays, FTN95 changes the array subscript addressing calculation. The /opt slower performance could show this.

I also note that addressing of A and B is often not sequential. As the memory for the 3 arrays is 46mb, this would imply that for these cases cache usage would not be as effective and so slow down the run time, although FTN95 does not utilise cache as well as other compilers. ( I would change test 3 and store A in a transposed form to improve cache usage, and also use DavidB's real8 dot product function. I think there is also a real4 version somewhere in this forum )

Unfortunately, all this is just guess work !!

John

12 Jul 2014 12:39 #14336

I have further investigated the run time performance of this program using FTN95. There are a few options for improving the performance. Improve Memory Access: Use A’(j,i) ; the transposed matrix of A when operating on the “J” index. This improves cacheing and reduces the memory access delays. Improve subscript calculations: Use functions or subroutines for the inner loop to reduce array subscript calculations. Improve function operation by using functions that support SSE2 vector instructions. For Test 2 : subroutine POSINV the inner loops can be changed using appropriate functions to:

        DO I = 1,NMAX 
          A1 = A(N,I) 
          IF (A1 == 0) cycle
          IF (N /= I) then
            call Vec_Add_SSE ( A(1,I),  A(1,N),  A1,  N-1 )
            call Vec_Add_SSE ( A(n+1,I),  A(n+1,N),  A1,  NMAX-N )
          end if
          A(N,I) = A1*D 
        end do

The results of this change for LPX = 2000 is Original code : 28.12 seconds (using real4) Use of function call in inner loops : 13.73 seconds (using real8) Use of SSE in function : 7.79 seconds (using real8) Note: as the SSE function is real8, the code was converted to Real*8 for these changes

For Test 3 : the combination of transpose and dot_product function call results in:

        DO J2 = 1,LPM 
          WERT = 0.0 
          DO L2 = 1,LPM 
            WERT = WERT + A(J1,L2)*B(L2,J2) 
          end do  
being changed to :
        DO J2 = 1,LPM 
          WERT = Vec_Sum_SSE  ( At(1,J1),  B(1,J2),  LPM )

The results of this change for LPX = 2000 is Original code : 66.86 seconds ( real4 with /opt) Original code : 76.0 seconds ( real8 with /opt ) Use of transpose and function call in inner loops: 9.12 seconds Use of SSE in function : 4.91 seconds

For Test 4: the results were less significant, although this was the test that was questioned. The original problem of using direct declaration of ALLOCATE arrays and transfer them to a subroutine (as previously posted) did remove this problem. I did do a change to real8 and replaced A(J1,L2) by At(L2,J1) but the improvement was less effective. The combination of these effects resulted in a similar run time. The results of this change for LPX = 2000 is Original code : 40.6 seconds ( with direct ALLOCATE arrays ) Original code : 34.2 seconds ( with common or subroutine BENCH interface) Use of A transpose and Real8: 31.4 seconds I changed the inner loop as below, but did not try to devise function replacements for the inner loop when L2 > J1 and I > 0 or L2 ⇐ J1 and I < 0. I did use A transpose for L2 ⇐ J1.

           DO L2 = 1,LPM 
             I = -I 
             IF (L2 > J1) THEN 
               IF (I > 0) THEN 
                 WERT = WERT - A(L2,J1)*B(L2,J2)
                 n = n + 1
               ELSE IF (B(L2,J2) > .00001) THEN
                 WERT = WERT - A(L2,J1)/B(L2,J2) 
                 n = n + 1
               END IF 
             ELSE 
               IF (I < 0) THEN 
                 WERT = WERT + At(L2,J1)*B(L2,J2) 
                 n = n + 1
               ELSE IF (B(L2,J2) > .00001) then
                 WERT = WERT + At(L2,J1)/B(L2,J2) 
                 n = n + 1
               END IF 
             END IF 
           END DO

I hope this shows some options available for improving performance with FTN95 Tests were carried out on an Intel i5-2300 CPU @ 2.8 GHz

John

Unfortunately with this forum, I can not easily attach the different versions I tested. I could email if you provide details in a PM.

17 Jul 2014 1:52 #14342

Apologies for not responding sooner, but I wanted to provide some structured response. I have generated a number of tests, being: Alloc_test_0.for the original code copied from the forum Alloc_test_A4.for modified for ALLOCATE only; to confirm the run times Alloc_test_C4.for modified for COMMON only; to confirm the run times Alloc_test_AC4.for tests both memory models, as a subroutine call Alloc_test_AC8.for real8 tests both memory models, as a subroutine call Alloc_test_AC8t.for modified for A transpose to improve cache usage Alloc_test_AC8v.for introduce vector functions to make subscript calculation more efficient Alloc_test_AC8s.for introduce SSE instructions for real8 vector instructions Alloc_test_AC4v.for real*4 test of vector instructions

These tests were performed for a number of alternatives: Most tests use both ALLOCATE and COMMON to demonstrate this is typically not an issue. Use /opt or not to show the effect of optimisation (good for simple vector functions but less effective for complex loops, such as test 4) Different processors : Acer i5-2300 @ 2.80 GHz or Dell i5-4200U @ 1.6 GHz up to 2.6 GHz (?) The purpose of the changes were as follows. AC4 showed that memory allocation approach should not change the results. The results from _A4 with /opt must be from local changes to addressing for allocatable array that do not work in FTN95. AC8 was introduced so that SSE instructions could be tested. AC8t was to show that A transpose should improve cache usage and did work for some tests. AC8v was to show that using vector functions (especially with /opt) improves the efficiency of subscript calculation. FTN95 is not good at optimising 2d array subscripts. AC8s was to show that SSE instructions can nearly double performance, especially for memory aligned arrays. In general these results were achieved. I think each change was effective for the tests they best suit. Test 4 was the least improved test, which says that this inner loop could be restructured to improve performance. All test results are in the spreadsheet timing_summary.xlsx. This shows some very interesting results, with the use of transpose, vector and SSE changes working in some cases and not in others. If you are familiar with pivot tables, then by re-arranging the fields, you can see some interesting results. To rerun the tests, use all_tests.bat in cmd.exe window. All files are in the dropbox link and hopefully accessible. I hope these results are of use.

John

https://www.dropbox.com/s/xnawf5zvm59nmtu/Alloc_Test.zip

18 Jul 2014 2:58 #14343

Dietmar, I have modified test 4 of Alloc_test_AC8v, as test 5 where I have changed the inner loop to be a function that operates on vectors of A and B ( I also use a local copy of the active row of A to reduce storage ) The results for this test change the run time for test 4 of (no_opt=31.6, opt=33.6) to (no_opt=23.9, opt=19.2) for test 5. This now shows that /opt improves the calculation time. FTN95 does not provide good optimisation for array subscript calculations for rank > 1 arrays. This is not the case for most optimising compilers (see polyhedron tests). The changes I have provided do show a way of changing the code to improve performance with FTN95 where possible, by:

  1. Improving cache usage via transpose (applies to all compilers) and
  2. Improving optimisation of the inner loop subscripts for rank 1 vectors.
  3. The 3rd main option of using SSE (vector) instructions is more difficult but is possible. Davidb provided the real*8 vector functions for dot_product and [A] = [A] + const * [B], which are the main inner loop functions for linear equation solution. Your inner loop in test 4/5 is more complex. There are a number of the significant issues for SSE instructions to be used effectively. These are: • Alignment of the 2 vectors to suit the SSE instructions, and • Memory access speed / bandwidth to get information from memory. If the vectors are large, they can not be cached and so the memory access speed ( typically 1333 MHz, 1600 MHz or 1800 MHz ) is more critical than the CPY clock rate. I have been suggesting that a library of SSE functions for vector calculations could overcome a lot of the performance problems in FTN95, as it is typically just a few inner loops where all the performance is impacted. FTN95 would also need to improve the alignment control of vectors. I have also been experimenting with AVX instructions, but typically for large vectors that are not cached, the memory bandwidth for a single process limits any improvement. I hope this example has demonstrated the significant improvement that can be achieved from improving cacheing of inner loops and of using SSE instructions, providing comparable performance of other compilers.

John

18 Jul 2014 3:02 #14344

Listing of Test 5, which is an improvement of test 4

      subroutine test_5 ( a, b, c, LPX, sum4 )
!
!   reworked Test 4 to improve performance by simplifying array subscript calculations
!
      integer*4 LPX
      REAL*8    A(LPX,LPX), B(LPX,LPX), C(LPX,LPX)
      real*8    sum4     ! abs sum from test4 used to check result
!
      COMMON /COMIO/ KT,KL 
      integer*4 KT,KL
!
      Real*8    At(LPX)  ! use a local array At: for active row of A
      integer*4 lpm,j1,j2
      integer*8 n
      REAL*8    loop_fn, WERT,SUMF
      External  loop_fn
!
      WRITE (KT,1010) ' Multiplikation ohne Vektormglichkeit '

       SUMF = 0 
       n    = 0
       LPM  = LPX 
       DO J1 = 1,LPM 
         At = A(J1,:)
         DO J2 = 1,LPM 
!
           WERT = loop_fn ( A(1,J1), B(1,J2), At, LPM, J1 )
!
           C(J1,J2) = WERT 
           SUMF = SUMF + ABS(WERT) 
         end do
       end do
!
       WRITE (KT,1030) SUMF, (sumf-sum4), n
 1010 FORMAT(/a) 
 1030 FORMAT(' Testwert fr Teil5:',2ES15.5, 2x,i0) 
!
       CALL CPUTIM ('Test 5 : Multiplikation ohne Vektormglichkeit') 
!
      end subroutine test_5

      real*8 function loop_fn ( A, B, At, LPM, J1 )
!
!   Inner loop of test 4/5 using active columns of Matrix A, B
!   and active row of A
!
      real*8 A(*), B(*), At(*), wert
      integer*4 J1, LPM, L2
!
       wert = 0
!    L2 odd : I < 0           
       DO L2 = 1,LPM,2
         IF (L2 > J1) THEN 
           IF (B(L2) <= .00001) cycle   ! is this a suitable test, can B be -ve ?
           WERT = WERT - A(L2)/B(L2) 
         ELSE 
           WERT = WERT + At(L2)*B(L2) 
         END IF 
       END DO
!
!    L2 even : I > 0           
       DO L2 = 2,LPM,2
         IF (L2 > J1) THEN 
           WERT = WERT - A(L2)*B(L2)
         ELSE 
           IF (B(L2) <= .00001) cycle
           WERT = WERT + At(L2)/B(L2) 
         END IF 
       END DO
!
       loop_fn = wert
      end function loop_fn
Please login to reply.