|
2 | 2 | #:include 'omp_macros.fpp' |
3 | 3 | #:include 'acc_macros.fpp' |
4 | 4 |
|
| 5 | +! GPU parallel region (scalar reductions, maxval/minval) |
5 | 6 | #:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & |
6 | 7 | & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & |
7 | 8 | & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None, extraOmpArgs=None) |
|
20 | 21 | #endif |
21 | 22 | #:enddef |
22 | 23 |
|
| 24 | +! GPU parallel loop over threads (most common GPU macro) |
23 | 25 | #:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & |
24 | 26 | & default='present', firstprivate=None, reduction=None, reductionOp=None, & |
25 | 27 | & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & |
|
39 | 41 | #endif |
40 | 42 | #:enddef |
41 | 43 |
|
| 44 | +! Required closing for GPU_PARALLEL_LOOP |
42 | 45 | #:def END_GPU_PARALLEL_LOOP() |
43 | 46 | #:set acc_end_directive = '!$acc end parallel loop' |
44 | 47 | #:set omp_end_directive = END_OMP_PARALLEL_LOOP() |
|
50 | 53 | #endif |
51 | 54 | #:enddef |
52 | 55 |
|
| 56 | +! Mark routine for device compilation |
53 | 57 | #:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, cray_noinline=False, extraAccArgs=None, & |
54 | 58 | & extraOmpArgs=None) |
55 | 59 | #:assert isinstance(cray_inline, bool) |
|
106 | 110 | #:endif |
107 | 111 | #:enddef |
108 | 112 |
|
| 113 | +! Declare device-resident data |
109 | 114 | #:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, & |
110 | 115 | & link=None, extraAccArgs=None, extraOmpArgs=None) |
111 | 116 | #:set acc_code = ACC_DECLARE(copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, & |
|
123 | 128 | #endif |
124 | 129 | #:enddef |
125 | 130 |
|
| 131 | +! Inner loop within a GPU parallel region |
126 | 132 | #:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, & |
127 | 133 | & extraAccArgs=None, extraOmpArgs=None) |
128 | 134 | #:set acc_code = ACC_LOOP(collapse=collapse, parallelism=parallelism, data_dependency=data_dependency, reduction=reduction, & |
|
137 | 143 | #endif |
138 | 144 | #:enddef |
139 | 145 |
|
| 146 | +! Scoped GPU data region |
140 | 147 | #:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, & |
141 | 148 | & deviceptr=None, attach=None, default=None, extraAccArgs=None, extraOmpArgs=None) |
142 | 149 | #:set acc_code = ACC_DATA(code=code, copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, & |
|
155 | 162 | #endif |
156 | 163 | #:enddef |
157 | 164 |
|
| 165 | +! Host code with device pointers (for MPI with GPU buffers) |
158 | 166 | #:def GPU_HOST_DATA(code, use_device_addr=None, use_device_ptr=None, extraAccArgs=None, extraOmpArgs=None) |
159 | 167 | #:if use_device_addr is not None and use_device_ptr is not None |
160 | 168 | #:set use_device_addr_end_index = len(use_device_addr) - 1 |
|
183 | 191 | #endif |
184 | 192 | #:enddef |
185 | 193 |
|
| 194 | +! Unscoped enter data: copy in, create, or attach data on the device |
186 | 195 | #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None, extraOmpArgs=None) |
187 | 196 | #:set acc_code = ACC_ENTER_DATA(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, attach=attach, & |
188 | 197 | & extraAccArgs=extraAccArgs) |
|
196 | 205 | #endif |
197 | 206 | #:enddef |
198 | 207 |
|
| 208 | +! Unscoped exit data: copy out to host, delete, or detach device data |
199 | 209 | #:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None, extraOmpArgs=None) |
200 | 210 | #:set acc_code = ACC_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraAccArgs=extraAccArgs) |
201 | 211 | #:set omp_code = OMP_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraOmpArgs=extraOmpArgs) |
|
207 | 217 | #endif |
208 | 218 | #:enddef |
209 | 219 |
|
| 220 | +! Atomic operation on device |
210 | 221 | #:def GPU_ATOMIC(atomic, extraAccArgs=None, extraOmpArgs=None) |
211 | 222 | #:set acc_code = ACC_ATOMIC(atomic=atomic, extraAccArgs=extraAccArgs) |
212 | 223 | #:set omp_code = OMP_ATOMIC(atomic=atomic, extraOmpArgs=extraOmpArgs) |
|
218 | 229 | #endif |
219 | 230 | #:enddef |
220 | 231 |
|
| 232 | +! End atomic capture block |
221 | 233 | #:def END_GPU_ATOMIC_CAPTURE() |
222 | 234 | #:set acc_end_directive = '!$acc end atomic' |
223 | 235 | #:set omp_end_directive = '!$omp end atomic' |
|
228 | 240 | #endif |
229 | 241 | #:enddef |
230 | 242 |
|
| 243 | +! Copy data between host and device |
231 | 244 | #:def GPU_UPDATE(host=None, device=None, extraAccArgs=None, extraOmpArgs=None) |
232 | 245 | #:set acc_code = ACC_UPDATE(host=host, device=device, extraAccArgs=extraAccArgs) |
233 | 246 | #:set omp_code = OMP_UPDATE(host=host, device=device, extraOmpArgs=extraOmpArgs) |
|
239 | 252 | #endif |
240 | 253 | #:enddef |
241 | 254 |
|
| 255 | +! Synchronization barrier |
242 | 256 | #:def GPU_WAIT(extraAccArgs=None, extraOmpArgs=None) |
243 | 257 | #:set acc_code = ACC_WAIT(extraAccArgs=extraAccArgs) |
244 | 258 | #:set omp_code = OMP_WAIT(extraOmpArgs=extraOmpArgs) |
|
250 | 264 | #endif |
251 | 265 | #:enddef |
252 | 266 |
|
| 267 | +! Import GPU library module (openacc or omp_lib) |
253 | 268 | #:def USE_GPU_MODULE() |
254 | 269 | #if defined(MFC_OpenACC) |
255 | 270 | use openacc |
|
258 | 273 | #endif |
259 | 274 | #:enddef |
260 | 275 |
|
| 276 | +! Emit code only for AMD compiler |
261 | 277 | #:def DEF_AMD(code) |
262 | 278 | #:if MFC_COMPILER == AMD_COMPILER_ID |
263 | 279 | $:code |
264 | 280 | #:endif |
265 | 281 | #:enddef |
266 | 282 |
|
| 283 | +! Emit code for non-Cray compilers |
267 | 284 | #:def UNDEF_CCE(code) |
268 | 285 | #:if MFC_COMPILER != CCE_COMPILER_ID |
269 | 286 | $:code |
270 | 287 | #:endif |
271 | 288 | #:enddef |
272 | 289 |
|
| 290 | +! Emit code only for Cray compiler |
273 | 291 | #:def DEF_CCE(code) |
274 | 292 | #:if MFC_COMPILER == CCE_COMPILER_ID |
275 | 293 | $:code |
276 | 294 | #:endif |
277 | 295 | #:enddef |
278 | 296 |
|
| 297 | +! Emit code only when the compiler is neither NVIDIA nor PGI |
279 | 298 | #:def UNDEF_NVIDIA(code) |
280 | 299 | #:if MFC_COMPILER != NVIDIA_COMPILER_ID and MFC_COMPILER != PGI_COMPILER_ID |
281 | 300 | $:code |
|
0 commit comments