CSC/ECE 506 Spring 2010/ch 2 maf: Difference between revisions
Jump to navigation
Jump to search
| Line 52: | Line 52: | ||
sum = sum + a[i]; | sum = sum + a[i]; | ||
Print sum; | Print sum; | ||
// Data parallel implementation in C++ with OpenMP. | |||
int main(void) | |||
{ | |||
double a[8], b[8], c[8], localSum[2]; | |||
#pragma omp parallel for | |||
for (int id = 0; id < 2; id++) | |||
{ | |||
int local_iter = 4; | |||
int start_iter = id * local_iter; | |||
int end_iter = start_iter + local_iter; | |||
for (int i = start_iter; i < end_iter; i++) | |||
a[i] = b[i] + c[i]; | |||
local_sum[id] = 0; | |||
for (int i = start_iter; i < end_iter; i++) | |||
if (a[i] > 0) | |||
localSum[id] = localSum[id] + a[i]; | |||
} | |||
double sum = localSum[0] + localSum[1]; | |||
cout << sum; | |||
} | |||
// Data parallel implementation in C for CUDA. | // Data parallel implementation in C for CUDA. | ||
| Line 59: | Line 85: | ||
double* b, | double* b, | ||
double* c, | double* c, | ||
double* | double* localSum) | ||
{ | { | ||
int id = threadIdx.x; | int id = threadIdx.x; | ||
| Line 71: | Line 97: | ||
local_sum[id] = 0; | local_sum[id] = 0; | ||
for (int i = start_iter; i < end_iter; i++) | for (int i = start_iter; i < end_iter; i++) | ||
if (a[i] > 0) | |||
localSum[id] = localSum[id] + a[i]; | |||
} | } | ||
int main() | int main() | ||
{ | { | ||
double a[8], b[8], c[8], | double a[8], b[8], c[8], localSum[2]; | ||
kernel<<<1, 2>>>(a, b, c, | kernel<<<1, 2>>>(a, b, c, localSum); | ||
double sum = | double sum = localSum[0] + localSum[1]; | ||
cout << sum; | cout << sum; | ||
} | } | ||
| Line 97: | Line 124: | ||
LOCAL_SUM[ID] = 0; | LOCAL_SUM[ID] = 0; | ||
DO I = START_ITER:END_ITER | DO I = START_ITER:END_ITER | ||
LOCAL_SUM[ID] = LOCAL_SUM[ID] + A[I] | IF A[I] > 0 THEN | ||
LOCAL_SUM[ID] = LOCAL_SUM[ID] + A[I] | |||
END IF | |||
END DO | END DO | ||
END FORALL | END FORALL | ||
Revision as of 17:38, 27 January 2010
Supplement to Chapter 2: The Data Parallel Programming Model
Overview
| Aspects | Shared Memory | Message Passing | Data Parallel |
|---|---|---|---|
| Communication | implicit (via loads/stores) | explicit messages | implicit |
| Synchronization | explicit | implicit (via messages) | implicit for SIMD; explicit for SPMD |
| Hardware support | typically required | none | |
| Development effort | lower | higher | higher |
| Tuning effort | higher | lower | |
A Code Example
// Simple sequential code from Solihin 2008, page 25.
// NOTE: pedagogical pseudo-code — "Print sum" stands in for a real output
// statement, and a/b/c/sum are assumed declared and initialized elsewhere.
for (i = 0; i < 8; i++)
a[i] = b[i] + c[i];
sum = 0;
// Accumulate only the positive results of a[].
for (i = 0; i < 8; i++)
if (a[i] > 0)
sum = sum + a[i];
Print sum;
// Data parallel implementation in C++ with OpenMP.
// Two workers, each owning a contiguous slice of 4 of the 8 elements.
// NOTE(review): b and c are uninitialized here; the textbook example
// assumes they already hold input data.
int main(void)
{
    double a[8], b[8], c[8], localSum[2];
    #pragma omp parallel for
    for (int id = 0; id < 2; id++)
    {
        int local_iter = 4;                    // iterations per worker
        int start_iter = id * local_iter;      // first index owned by this worker
        int end_iter = start_iter + local_iter;
        for (int i = start_iter; i < end_iter; i++)
            a[i] = b[i] + c[i];
        // Fixed: original wrote "local_sum[id] = 0;", an undeclared name;
        // the declared array is localSum.
        localSum[id] = 0;
        for (int i = start_iter; i < end_iter; i++)
            if (a[i] > 0)
                localSum[id] = localSum[id] + a[i];
    }
    // No race: each worker wrote only its own localSum slot.
    double sum = localSum[0] + localSum[1];
    cout << sum;
}
// Data parallel implementation in C for CUDA.
// Expects a <<<1, 2>>> launch: each of the 2 threads processes a contiguous
// slice of 4 elements. All pointers must be device pointers; localSum must
// have room for one double per thread.
__global__ void kernel(
double* a,
double* b,
double* c,
double* localSum)
{
    int id = threadIdx.x;                  // worker id: 0 or 1
    int local_iter = 4;                    // iterations per thread
    int start_iter = id * local_iter;      // first index owned by this thread
    int end_iter = start_iter + local_iter;
    for (int i = start_iter; i < end_iter; i++)
        a[i] = b[i] + c[i];
    // Fixed: original wrote "local_sum[id] = 0;", an undeclared name;
    // the parameter is localSum.
    localSum[id] = 0;
    for (int i = start_iter; i < end_iter; i++)
        if (a[i] > 0)
            localSum[id] = localSum[id] + a[i];
}
int main()
{
    // NOTE(review): b and c are uninitialized in the textbook example;
    // it assumes they already hold input data.
    double a[8], b[8], c[8], localSum[2];
    // A kernel cannot dereference host stack pointers: allocate device
    // buffers, copy the inputs over, launch, then copy the result back.
    double *d_a, *d_b, *d_c, *d_localSum;
    cudaMalloc(&d_a, 8 * sizeof(double));
    cudaMalloc(&d_b, 8 * sizeof(double));
    cudaMalloc(&d_c, 8 * sizeof(double));
    cudaMalloc(&d_localSum, 2 * sizeof(double));
    cudaMemcpy(d_b, b, 8 * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, 8 * sizeof(double), cudaMemcpyHostToDevice);
    kernel<<<1, 2>>>(d_a, d_b, d_c, d_localSum);
    // The blocking device-to-host copy also synchronizes with the kernel,
    // so localSum is valid on the host afterwards.
    cudaMemcpy(localSum, d_localSum, 2 * sizeof(double), cudaMemcpyDeviceToHost);
    double sum = localSum[0] + localSum[1];
    cout << sum;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_localSum);
}
C DATA PARALLEL IMPLEMENTATION IN FORTRAN
C Fixed: Fortran array references use parentheses, not C-style brackets,
C and LOCAL_SUM(2) is 1-based, so the final sum reads elements 1 and 2
C (the original read nonexistent elements 0 and 1). Stray C-style
C semicolon removed and the IF condition parenthesized.
REAL A(8), B(8), C(8), LOCAL_SUM(2), SUM
FORALL ID = 1:2
LOCAL_ITER = 4
START_ITER = (ID - 1) * LOCAL_ITER + 1
END_ITER = START_ITER + LOCAL_ITER - 1
DO I = START_ITER:END_ITER
A(I) = B(I) + C(I)
END DO
LOCAL_SUM(ID) = 0
DO I = START_ITER:END_ITER
IF (A(I) > 0) THEN
LOCAL_SUM(ID) = LOCAL_SUM(ID) + A(I)
END IF
END DO
END FORALL
SUM = LOCAL_SUM(1) + LOCAL_SUM(2)
WRITE(*,*) SUM