💾 Archived View for spam.works › mirrors › textfiles › computers › bauer.lst captured on 2023-12-28 at 17:13:37.

View Raw

More Information

⬅️ Previous capture (2023-06-14)

-=-=-=-=-=-=-

 
_OPTIMIZING IN A PARALLEL ENVIRONMENT_
by Barr E. Bauer

[LISTIN? ONE]

      program test                                                      1

      parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10)              7
      real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST)                             8
      real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST)                             9
      real*8 sub_total(MAXFIRST), partial_total(4)                      10
      real*8 d(MAXTHIRD), c, tmp    ! local variables                   11
      real*8 dist(MAXSECOND,MAXFIRST), grand_total                      12
      real*8 grand_total      ! test for proper operation               13
      logical parallel        ! selects 2-version loops                 14
      integer*4 iflag         ! used to show LASTLOCAL value            15
                                                                        16
      data parallel /.false./                                           17
      data sub_total, iflag /MAXFIRST*0.0, 0/                           18

                                                                        22

                                                                        25
      do i = 1, MAXFIRST                                                26

                                                                        30

                                                                        32
        do j = 1, MAXSECOND                                             33
          do k = 1, MAXTHIRD                                            34
            a(k,j,i) = dsqrt(dfloat(i*j*k))                             35
            c = 1.0 - a(k,j,i)                                          36
            if (c .le. 0.0 .and. i .lt. j*k) then                       37
              c = -c                                                    38
            else                                                        39
              c = c**2                                                  40
            endif                                                       41
            b(k,j,i) = 32*(dcos(c)**5)*dsin(c)-                         42
     1                 32*(dcos(c)**3)*dsin(c)+                         43
     2                  6*dcos(c)*dsin(c)                               44
          enddo                                                         45
        enddo                                                           46

                                                                        50

                                                                        53
?        do j=1, MAXSECOND                                               54
          tmp = 0.0                                                     55
          do k = 1, MAXTHIRD                                            56
            d(k) = a(k,j,i) - b(k,j,i)                                  57
          enddo                                                         58
          do k = 1, MAXTHIRD                                            59
            tmp = tmp + d(k)**2                                         60
          enddo                                                         61
          dist(j,i) = dsqrt(tmp)                                        62
          if (dist(j,i) .le. 0.1) iflag = iflag + 1                     63
          sub_total(j) = sub_total(j) + dist(j,i)                       64
        enddo                                                           65
      enddo                                                             66

                                                                        74
C$    parallel = .true.                                                 75
      grand_total = 0.0                                                 76
      if (parallel) then                     ! parallel version         77
C$      num_threads = mp_numthreads()                                   78
        ichunk = (MAXFIRST + (num_threads - 1))/num_threads             79
                                                                        80
C$doacross local(k,j),                                                  81
C{body}amp;        share(num_threads,partial_total,sub_total,ichunk)            82
                                                                        83
        do k = 1, num_threads ! this loop is parallelized               84
          partial_total(k) = 0.0                                        85
          do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST)          86
            partial_total(k) = partial_total(k) + sub_total(j)          87
          enddo                                                         88
        enddo                                                           89
        do j = 1, num_threads   ! smaller loop handled as scalar        90
          grand_total = grand_total + partial_total(j)                  91
        enddo                                                           92
      else                                   ! the scalar version       93
        do j = 1, MAXFIRST                                              94
          grand_total = grand_total + sub_total(j)                      95
        enddo                                                           96
      endif                                                             97
                                                                        98
      if (parallel) then                                                99
C$      write (*,10) grand_total, num_threads                           100
C$      write (*,20) iflag                                              101
      else                                                              102
        write (*,30) grand_total                                        103
        write (*,40) iflag                                              104
      endif                                                             105
      stop                                                              106
C$10  format(1x,'grand total = ',g10.3,'threads = ',i4)                 107
C$20  format(1x,'parallel iflag = ',i10)                                108
?30    format(1x,'grand total = ',g10.3)                                 109
40    format(1x,'scalar iflag = ',i10)                                  110
      end                                                               111



[LISTIN? TWO]


(source code)

      subroutine example(a, b, c, n)
      integer*4 n
      real*4 a(n), b(n), c(n)

      (additional code)

c$doacross local(i, x)
      do i=1, n
        x = a(n) * b(n)
        c(n) = x**2
      enddo

      (additional code)

      return
      end

(the loop is transformed to)

      subroutine _example_1(
     1  _local_start,             ! index starting value
?     2  _local_ntrip,             ! number of loop executions
     3  _incr,                    ! index increment
     4  _my_threadno)             ! unique process ID number

      integer*4 _local_start, _local_ntrip, _incr, _my_threadno

      integer*4  i                ! declared local
      real*4     x                ! declared local

      integer*4  _tmp             ! created local

      i = _local_start
      do _tmp = 1, _local_ntrip
        x = a(i) * b(i)
        c(i) = x**2
        i = i + _incr
      enddo
      return
      end


Exampl? 1? A typical D? loop

 
do i = 1, n
   a(i) = x * b(i)
enddo


Exampl? 2? ? D? loo? i? whic? th? arra? variabl? reference? ? ?
valu? tha? i? no? curren? wit? ?h? index
 
do i = 2, n
   arr(i) = b(i) - arr(i-1)
enddo

Exampl? 3? A? exampl? o? loa? imbalance
 
      do i = 1, n
          do j = 1, i
               a(j, i) = a(j, i) * xmult
          enddo
      enddo


Exampl? 4? Loa? balancing

      num_threads = mp_numthreads()
c$doacross local(i, j, k)
      do k = 1, num_threads
          do i = k, n, num_threads
               do j = 1, i
                    a(j, i) = a(j, i) * xmult
               enddo
          enddo
      enddo