CSC/ECE 506 Spring 2010/ch 2 maf
Supplement to Chapter 2: The Data Parallel Programming Model
Overview
| Aspects | Shared Memory | Message Passing | Data Parallel |
|---|---|---|---|
| Communication | implicit (via loads/stores) | explicit messages | implicit |
| Synchronization | explicit | implicit (via messages) | implicit for SIMD; explicit for SPMD |
| Hardware support | typically required | none | |
| Development effort | lower | higher | higher |
| Tuning effort | higher | lower | |
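For comparison with the data parallel example in the next section, the following is a minimal shared-memory sketch of the same computation (a hypothetical illustration using C++11 std::thread, not taken from Solihin 2008). It shows the first two rows of the table in miniature: the two threads communicate implicitly through the shared arrays, while synchronization is explicit in the join() calls that must complete before the partial sums are combined.

// Hypothetical shared-memory version of the example in the next section,
// assuming C++11 threads; compile with a C++11 compiler and -pthread.
#include <iostream>
#include <thread>

double a[8], b[8], c[8], local_sum[2];   // shared by both threads

void work(int id)
{
    // Each thread handles a contiguous block of four elements.
    int start = id * 4, end = start + 4;
    for (int i = start; i < end; i++)
        a[i] = b[i] + c[i];              // implicit communication via loads/stores
    local_sum[id] = 0;
    for (int i = start; i < end; i++)
        if (a[i] > 0)
            local_sum[id] = local_sum[id] + a[i];
}

int main()
{
    std::thread t0(work, 0), t1(work, 1);
    t0.join();                           // explicit synchronization: wait for
    t1.join();                           // both threads before combining results
    std::cout << local_sum[0] + local_sum[1];
}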
A Code Example
// Simple sequential code from Solihin 2008, page 25.
for (i = 0; i < 8; i++)
    a[i] = b[i] + c[i];
sum = 0;
for (i = 0; i < 8; i++)
    if (a[i] > 0)
        sum = sum + a[i];
Print sum;
// Data parallel implementation in C for CUDA.
#include <iostream>

__global__ void kernel(
    double* a,
    double* b,
    double* c,
    double* local_sum)
{
    // Each of the two threads works on a contiguous block of four elements.
    int id = threadIdx.x;
    int local_iter = 4;
    int start_iter = id * local_iter;
    int end_iter = start_iter + local_iter;

    // First loop: element-wise addition, independent across threads.
    for (int i = start_iter; i < end_iter; i++)
        a[i] = b[i] + c[i];

    // Second loop: each thread accumulates a partial sum of its own
    // positive elements, mirroring the sequential code above.
    local_sum[id] = 0;
    for (int i = start_iter; i < end_iter; i++)
        if (a[i] > 0)
            local_sum[id] = local_sum[id] + a[i];
}

int main()
{
    double a[8], b[8], c[8], local_sum[2];
    // For clarity the host arrays are passed directly; a complete program must
    // first copy the inputs to device memory (see the host-side sketch below).
    kernel<<<1, 2>>>(a, b, c, local_sum);
    double sum = local_sum[0] + local_sum[1];
    std::cout << sum;
}
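The listing above is deliberately simplified; in particular, a real CUDA program cannot pass host arrays straight to a kernel. The sketch below (not part of the original example, with error checking omitted) shows one way the host side could be completed using standard CUDA runtime calls: allocate device buffers with cudaMalloc, copy the inputs over with cudaMemcpy, launch the kernel, and copy the per-thread partial sums back before freeing the buffers. The input values chosen here are arbitrary.

// Hypothetical host-side completion of the example above; it assumes the
// same kernel definition as in the listing and omits error checking.
#include <iostream>
#include <cuda_runtime.h>

__global__ void kernel(double* a, double* b, double* c, double* local_sum);

int main()
{
    double b[8] = {1, 2, 3, 4, 5, 6, 7, 8};      // arbitrary sample inputs
    double c[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    double local_sum[2];

    // Allocate device copies of the four arrays.
    double *d_a, *d_b, *d_c, *d_sum;
    cudaMalloc((void**)&d_a, 8 * sizeof(double));
    cudaMalloc((void**)&d_b, 8 * sizeof(double));
    cudaMalloc((void**)&d_c, 8 * sizeof(double));
    cudaMalloc((void**)&d_sum, 2 * sizeof(double));

    // Copy the inputs from host to device.
    cudaMemcpy(d_b, b, 8 * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, 8 * sizeof(double), cudaMemcpyHostToDevice);

    // One block of two threads, as in the listing above.
    kernel<<<1, 2>>>(d_a, d_b, d_c, d_sum);

    // Copy the per-thread partial sums back and combine them on the host.
    cudaMemcpy(local_sum, d_sum, 2 * sizeof(double), cudaMemcpyDeviceToHost);
    std::cout << local_sum[0] + local_sum[1];

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); cudaFree(d_sum);
}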