(py)OpenCL na kartach graficznych:
Wprowadzenie do GPGPU
kolodziejj.info
● wyszukiwanie obiektów na zdjęciu● CPU-bound● 32 MPix
● implementacje:● Matlab: 6 godzin● Python + OpenCL: 1 minuta
● Tak, słownie: jedna minuta. 360 x szybciej.
Agenda● zastosowanie● co muszę umieć?● rys historyczny● sprzęt● terminologia● omówienie przykładowego kodu● co dalej?
GPGPU
General-Purpose computing on GPU
Zastosowanie● strumieniowe przetwarzanie obrazów dużych tablic w podobny sposób
● obrazy, video● kryptografia● fizyka (od astrofizyki do fizyki kwantowej)
● biologia, medycyna● bazy danych● ...
Co przyda się z C● funkcje● podstawowe typy danych ([unsigned] integer, float, double)
● tablice● wskaźniki● umiejętność rozrzucania wszędzie tych dziwnych znaczków { } ;
Historia● programowalne shadery, obsługa floatów● programowanie via OpenGL, DirectX
● pierwsze dedykowane platformy● Sh/RapidMind, Brook, Accelerator
Platformy GPGPU● nVidia CUDA
● i nadal mają się nieźle● Microsoft's F# + DirectCompute● AMD's FireStream● C++ AMP● OpenACC● …
Tylko takie GPU?
@ CSIRO
Tylko takie GPU?
@ nVidia
@ nVidia
@ benchmark.pl
„Zrównoleglalny” przykład
Dodawanie wektorów!
„Zrównoleglalny” przykład
A[0]
A[1]
A[2]
A[n-1]
B[0]
B[1]
B[2]
B[n-1]
+ =
C[0]
C[1]
C[2]
C[n-1]
= A[0] + B[0]
= A[1] + B[1]
= A[2] + B[2]
= A[n-1] + B[n-1]
„Zrównoleglalny” przykład
/*
 * Serial element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 *
 * a, b - input arrays, each holding at least n elements (only read).
 * n    - number of elements to process; 0 is allowed and does nothing.
 * c    - output array with room for at least n elements.
 */
void add(const float *a, const float *b, unsigned int n, float *c)
{
    /* The index has the same unsigned type as n, so the loop condition
       never compares a signed value against an unsigned one. */
    for (unsigned int i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }
}
CPU, szeregowo
CPU opsN ops
„Zrównoleglalny” przykład
A[0]
A[1]
A[2]
A[n-1]
B[0]
B[1]
B[2]
B[n-1]
+ =
C[0]
C[1]
C[2]
C[n-1]
= A[0] + B[0]
= A[1] + B[1]
= A[2] + B[2]
= A[n-1] + B[n-1]
„Zrównoleglalny” przykład
A[i] B[i]+ = C[i] = A[i] + B[i]
CPU, równolegle
CPU opsN ops / 4
Trochę liczb Dodawanie:● N = 24, jeden wątek – 24 kroki● N = 24, 4 wątki – 6 kroków● N = 2^20, 4 wątki – 2^18 kroków● potrzebujemy więcej wątków!
GPGPU● massive parallelism - setki-tysiące wątków na raz
GPGPUOptymalizowanie:
● maksymalnego wykorzystania jednostek obliczeniowych w GPU
● vs. maksymalnej przepustowości przesyłania danych
GPGPUMetryka:
● „przepustowość” obliczeń [MB/s]
OpenCL● kernele● work items● work groups● model pamięci
Kernele
z grubsza funkcje w C z keyword'em __kernel
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 * Each work item handles the single element selected by its global id.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *a,
                  __global const float *b,
                  const unsigned int n,
                  __global float *c)
{
    /* get_global_id() returns size_t; keeping it unsigned avoids a
       signed/unsigned comparison against n below. */
    const size_t gid = get_global_id(0);

    /* Work items whose id falls outside the arrays must not touch memory. */
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
Kernele
z grubsza funkcje w C z keyword'em __kernel
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 * Each work item handles the single element selected by its global id.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *a,
                  __global const float *b,
                  const unsigned int n,
                  __global float *c)
{
    /* get_global_id() returns size_t; keeping it unsigned avoids a
       signed/unsigned comparison against n below. */
    const size_t gid = get_global_id(0);

    /* Work items whose id falls outside the arrays must not touch memory. */
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
Work items● „instancje” kerneli● bardzo ograniczona prywatna pamięć (rzędu KB)
● powinny żyć bardzo krótko● ale może ich być bardzo dużo
A[0]
A[1]
A[2]
A[n-1]
B[0]
B[1]
B[2]
B[n-1]
+ =
C[0]
C[1]
C[2]
C[n-1]
= A[0] + B[0]
= A[1] + B[1]
= A[2] + B[2]
= A[n-1] + B[n-1]
Work items
C[0]
C[1]
C[2]
C[n-1]
= A[0] + B[0]
= A[1] + B[1]
= A[2] + B[2]
= A[n-1] + B[n-1]
Work items
C[2] = A[2] + B[2]
Work items
Work groups
● grupy work items● wielkość grupy – local work size● maksymalna wielkość zależy od urządzenia
● pamięć lokalna
C[0]
C[1]
C[2]
C[n-1]
= A[0] + B[0]
= A[1] + B[1]
= A[2] + B[2]
= A[n-1] + B[n-1]
Work groups
C[0]
C[1]
C[2]
C[n-1]
Work groups
Work groups
Work groups
local work size = 1024, N = 3 * 1024 = 3072
Model pamięci
Model pamięci
device
host
private mem
work group 1 work group 2 work group n
local memory local memorylocal memory
host memory
global memory
Czas na kod!
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 * Each work item handles the single element selected by its global id.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *a,
                  __global const float *b,
                  const unsigned int n,
                  __global float *c)
{
    /* get_global_id() returns size_t; keeping it unsigned avoids a
       signed/unsigned comparison against n below. */
    const size_t gid = get_global_id(0);

    /* Work items whose id falls outside the arrays must not touch memory. */
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
#include <stdio.h>#include <stdlib.h>#include <time.h>
#include <CL/cl.h>
#define ARRAY_SIZE 4096#define MAX_SOURCE_SIZE (0x100000)
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); }
float h_c[ARRAY_SIZE];
FILE *fp; char *source_str; size_t source_size;
fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp);
cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); }
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { printf("clBuildProgram returned non-zero status %d: ", ret);
if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n");
size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); }
cl_kernel kernel = clCreateKernel(program, "add", &ret);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size);
size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL);
ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context);
return 0;}
Jak to odpalić?
● Instalacja OpenCL + ICD● ICD – Installable Client Driver
Jak to odpalić?
$ gcc -std=c99 vectors_cl.c -o vectors_cl -l OpenCL
$ ./vectors_cl
● kompilacja, uruchomienie:
pyopencl
Czas na ładny kod!
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 * Each work item handles the single element selected by its global id.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *a,
                  __global const float *b,
                  const unsigned int n,
                  __global float *c)
{
    /* get_global_id() returns size_t; keeping it unsigned avoids a
       signed/unsigned comparison against n below. */
    const size_t gid = get_global_id(0);

    /* Work items whose id falls outside the arrays must not touch memory. */
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
import numpyimport osimport pyopencl
def add(a, b):
    """Add two vectors element-wise on an OpenCL device.

    The "add" kernel from ``vectors_cl.cl`` (located next to this file)
    is built and run with one work item per element of ``a``.

    Args:
        a: numpy array with the first operand. The kernel reads the data
            as 32-bit floats, so presumably callers pass ``float32``
            arrays (as ``test_add`` does) -- TODO confirm for other callers.
        b: numpy array with the second operand; must have the same shape
            as ``a``.

    Returns:
        A new numpy array shaped and typed like ``a`` holding ``a + b``.

    Raises:
        ValueError: if ``a`` and ``b`` differ in shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    # Create context.
    context = pyopencl.create_some_context()

    # Create command queue within it.
    queue = pyopencl.CommandQueue(context)

    # Build the "program" from the kernel source stored next to this file.
    kernel_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "vectors_cl.cl")
    with open(kernel_path) as kernel_file:
        program = pyopencl.Program(context, kernel_file.read()).build()

    # Create two readable buffers on the device memory and
    # copy the input data there.
    a_in = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=a)
    b_in = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf=b)

    # Create one writeable buffer on the device memory for
    # result.
    c_out = pyopencl.Buffer(
        context,
        pyopencl.mem_flags.WRITE_ONLY,
        a.nbytes  # Size.
    )

    # Execute the kernel. The element count comes from the input itself
    # rather than a module-level constant, so any length works.
    program.add(queue, a.shape, None,
                a_in, b_in, numpy.uint32(a.size), c_out)

    # Create empty numpy array on the host for result.
    c = numpy.empty_like(a)

    # Copy the result from the device to the host.
    pyopencl.enqueue_copy(queue, c, c_out)

    return c
# Execute the kernel. program.add(queue, a.shape, None, a_in, b_in, numpy.uint32(ARRAY_SIZE), c_out)
# Create empty numpy array on the host for result. c = numpy.empty_like(a)
# Copy the result from the device to the host. pyopencl.enqueue_copy(queue, c, c_out)
return c
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
 * Each work item handles the single element selected by its global id.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *a,
                  __global const float *b,
                  const unsigned int n,
                  __global float *c)
{
    /* get_global_id() returns size_t; keeping it unsigned avoids a
       signed/unsigned comparison against n below. */
    const size_t gid = get_global_id(0);

    /* Work items whose id falls outside the arrays must not touch memory. */
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
# Number of elements in each test vector.
ARRAY_SIZE = 4096
def test_add():
    """Check the device-side addition against a few known elements."""
    # Host-side inputs: a[i] = i and b[i] = 2 * i, both float32.
    a = numpy.arange(ARRAY_SIZE, dtype=numpy.float32)
    b = a * 2

    c = add(a, b)

    # c[i] must equal 3 * i; spot-check both ends of the vector.
    expected_by_index = ((0, 0), (1, 3), (-2, 12282), (-1, 12285))
    for index, expected in expected_by_index:
        assert c[index] == expected
$ py.test
examples/test_vectors_cl.py .
================= 1 passed in 0.21 seconds =================
Więcej o work itemsget_global_id(0);
Więcej o work itemsget_global_id(0);
get_global_id(1);
Więcej o work items
get_global_id(0);
get_global_id(1);
get_global_id(2);
Czas na ładny kod!
/*
 * Skeleton of a 2D kernel: one work item per (x, y) position of a
 * width x height array.
 *
 * A kernel entry point is marked with __kernel; __global is only an
 * address-space qualifier for the buffer pointers.
 */
__kernel void add(__global const float *in,
                  const int width,
                  const int height,
                  __global float *c)
{
    /* Dimension 0 supplies the x coordinate, dimension 1 the y coordinate. */
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    /* Linear index of (x, y) when rows are stored one after another. */
    const int gid = y * width + x;

    /* Skip work items that fall outside the array. */
    if (x < width && y < height) {
        //
    }
}
Optymalizacja● rozmiar work group● sekwencyjny dostęp do pamięci● uruchamianie wielu kerneli naraz● ...
Wzorce● mapa● redukcja● scan● histogram● scatter● gather● sort
Wzorce● pyopencl.elementwise● pyopencl.reduction● pyopencl.scan● pyopencl.algorithm● pyopencl.bitonic_sort
from pyopencl import ...● Image● array● clmath● clrandom● characterize● tools●
Wsparcie OpenCL● OpenCV● ClBLAS, ViennaCL, clFFT● Rivertrail, WebCL● ViNN● Go, Haskell, Lua, Rust, Java…● PgOpenCL
Randomowe uwagi● typy danych
● ile bajtów jest we floacie?● zawsze ustawiać dtype w numpy.array
● wielkość danych● jak zwykle, testy :)
Dzięki :) Pytania?
kolodziejj.info/talks/gpgpu/