gpgpu seminar (gpu accelerated libraries, 3 of 3, thrust)

GPGPU講習会 GPU最適化ライブラリの利用（その３）

長岡技術科学大学電気電子情報工学専攻出川智啓

本講習会の目標

GPGPU先端シミュレーションシステムの使用方法の習得

GPUの活用方法の修得

CUDAプログラミング技法の修得

並列計算手法の修得

2015/10/28GPGPU講習会2

本日の内容

GPU最適化ライブラリの利用（その３）

thrustの紹介

thrustによる共役勾配法実装の改良（可搬性の向上）

連立一次方程式を解くプログラムの作成

ライブラリを利用

関数(およびCUDA API)の呼出のみで作成

3回に分けて徐々に効率化

今回はThrustライブラリを利用して実装の効率化を図る

併せてCPUコードとの共通性や可搬性を向上する


GPU最適化ライブラリ

ライブラリ


特定の処理を行う複数のプログラムを再利用可能な形でまとめた集合体

動画像処理やファイル圧縮，数値計算などが有名

自作のプログラムよりも性能が高いため，関数を置き換えるだけで処理速度の向上に貢献

数値計算ライブラリ


FFT（Fast Fourier Transform） FFTW

線形代数演算（ベクトル処理，行列処理）

BLAS（Basic Linear Algebra Subprogram）

BLASを利用した線形代数演算ライブラリ LAPACK LINPACK ScaLAPACK

BLASやLAPACKのメーカー別実装 MKL Intel Math Kernel Library ACML AMD Core Math Library IMSL International Mathematics and Statistics Library

CUDA付属のライブラリ


cuBLAS 密行列向け線形代数演算

cuSPARSE 疎行列向け線形代数演算

cuFFT フーリエ変換

cuRAND 乱数生成

Thrust ソート，縮約，スキャン等

NPP 画像処理，信号処理

など

NVIDIAホームページに一覧がある

https://developer.nvidia.com/gpu-accelerated-libraries

その他GPU向けライブラリ


cuDNN https://developer.nvidia.com/cudnn

Deep Neural Network用のライブラリ

機械学習用のフレームワークをサポート

Caffe Theano Torch

cuDNNを使ったDIGITSというシステムを利用してNeural Networkのトレーニングを行うことが可能



MAGMA http://icl.cs.utk.edu/magma/

NVIDIA GPU向けの線形代数ライブラリ

CPUも同時に使うハイブリッド型のライブラリであるため，GPU単体より高速

BLAS, LAPACKに準ずるような形で関数形が定められている

cuBLASに取り込まれている関数もある

ソースコードが配布されており，無料で入手できる



cuBLAS‐XT https://developer.nvidia.com/cublasxt

cuBLASライブラリのマルチGPU向け実装

CUDA 6.0, 6.5から利用可能

共役勾配法

共役勾配法


連立一次方程式を解くためのアルゴリズム

係数行列が対称・正定値である連立一次方程式が対象

Hestenes and Stiefel(1952)によって提案

反復解法の性質を持ちながら，直接解法のように有限回の計算で解が得られる

「世紀の大解法」ともてはやされた

丸め誤差に弱く，有限回の計算で終わらないこともある

Hestenes, Magnus R., Stiefel, Eduard (December, 1952). "Methods of Conjugate Gradients for Solving Linear Systems". Journal of Research of the National Bureau of Standards 49 (6).

連立一次方程式の解法


直接法

係数行列を単位行列（や上三角，下三角行列）に変形することで未知数を求める方法

所定の計算回数で解が得られる

計算量が多く，大規模な問題には適用が難しい

反復法

係数行列を変更せず，未知数に推定値を代入して所定の計算を行い，推定値が解に十分近づくまで計算を繰り返す方法

よい推定値を選べば非常に高速に解が得られる

共役勾配法のアルゴリズム


連立一次方程式Ax=bに対する共役勾配法

Ap 係数行列Aとベクトルpの積

( , ) ベクトル同士の内積

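Compute $r^{(0)} = b - A x^{(0)}$. Set $p^{(0)} = 0$, $c_2^{(0)} = 0$.

For $k = 1, \ldots$, until $\|r\| / \|b\| < \varepsilon$, Do

$p^{(k)} = r^{(k)} + c_2^{(k-1)} p^{(k-1)}$

$c_1^{(k)} = (r^{(k)}, r^{(k)}) / (p^{(k)}, A p^{(k)})$

$x^{(k+1)} = x^{(k)} + c_1^{(k)} p^{(k)}$

$r^{(k+1)} = r^{(k)} - c_1^{(k)} A p^{(k)}$

$c_2^{(k)} = (r^{(k+1)}, r^{(k+1)}) / \{ c_1^{(k)} (p^{(k)}, A p^{(k)}) \}$

EndDo

$A$ 係数行列，$x$ 解ベクトル，$b$ 右辺ベクトル，$r$ 残差ベクトル，$p$ 補助ベクトル，$\|\cdot\|$ $l_2$-ノルム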

共役勾配法のバリエーション


自乗共役勾配法（CGS法）

非対称行列に対応

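Compute $r^{(0)} = b - A x^{(0)}$. Set $p^{(0)} = 0$, $c_2^{(0)} = 0$, $r^{*} = r^{(0)}$.

For $k = 1, \ldots$, until $\|r\| / \|b\| < \varepsilon$, Do

$p^{(k)} = r^{(k)} + c_2^{(k-1)} z^{(k-1)}$

$u^{(k)} = p^{(k)} + c_2^{(k-1)} ( z^{(k-1)} + c_2^{(k-1)} u^{(k-1)} )$

$c_1^{(k)} = (r^{*}, r^{(k)}) / (r^{*}, A u^{(k)})$

$z^{(k)} = p^{(k)} - c_1^{(k)} A u^{(k)}$

$x^{(k+1)} = x^{(k)} + c_1^{(k)} ( p^{(k)} + z^{(k)} )$

$r^{(k+1)} = r^{(k)} - c_1^{(k)} A ( p^{(k)} + z^{(k)} )$

$c_2^{(k)} = (r^{*}, r^{(k+1)}) / \{ c_1^{(k)} (r^{*}, A u^{(k)}) \}$

EndDo

$r^{*}$ 疑似残差，$u$ 補助ベクトル，$z$ 補助ベクトル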

共役勾配法のバリエーション


安定化双共役勾配法（Bi‐CGSTAB法）

非対称行列に対応

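Compute $r^{(0)} = b - A x^{(0)}$. Set $p^{(0)} = 0$, $c_2^{(0)} = 0$, $r^{*} = r^{(0)}$.

For $k = 1, \ldots$, until $\|r\| / \|b\| < \varepsilon$, Do

$p^{(k)} = r^{(k)} + c_2^{(k-1)} ( p^{(k-1)} - c_3^{(k-1)} A p^{(k-1)} )$

$c_1^{(k)} = (r^{*}, r^{(k)}) / (r^{*}, A p^{(k)})$

$t^{(k)} = r^{(k)} - c_1^{(k)} A p^{(k)}$

$c_3^{(k)} = (A t^{(k)}, t^{(k)}) / (A t^{(k)}, A t^{(k)})$

$x^{(k+1)} = x^{(k)} + c_1^{(k)} p^{(k)} + c_3^{(k)} t^{(k)}$

$r^{(k+1)} = r^{(k)} - c_3^{(k)} A t^{(k)}$

$c_2^{(k)} = (r^{*}, r^{(k+1)}) / \{ c_3^{(k)} (r^{*}, A p^{(k)}) \}$

EndDo

$r^{*}$ 疑似残差，$t$ 補助ベクトル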

連立一次方程式


3重対角行列

2次元Poisson方程式から導かれる係数行列を簡略化

解(x)が0, 1, 2・・・N−1となるようbを設定

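$$
\begin{pmatrix}
-4 & 1 & & & 0 \\
1 & -4 & 1 & & \\
 & \ddots & \ddots & \ddots & \\
 & & 1 & -4 & 1 \\
0 & & & 1 & -4
\end{pmatrix}
\begin{pmatrix} x_1 \\ x_2 \\ \vdots \\ x_{N-1} \\ x_N \end{pmatrix}
=
\begin{pmatrix} b_1 \\ b_2 \\ \vdots \\ b_{N-1} \\ b_N \end{pmatrix}
$$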

CPUプログラム（制御部分）


#include<stdlib.h>#include<stdio.h>#include<math.h>int main(void){

int N = 1 << 10; //未知数の数210const double err_tol = 1e‐9; //許容誤差const int max_ite = 1<<20;//反復回数の上限

double *x; //近似解ベクトルdouble *b; //右辺ベクトルdouble *A; //係数行列double *sol; //厳密解double *r, rr; //残差ベクトル, 残差の内積double *p, *Ax; //補助ベクトル，行列ベクトル積double c1, c2, dot; //計算に使う係数int i, k;//メモリの確保A = (double *)malloc(sizeof(double)*N*N);x = (double *)malloc(sizeof(double)*N);b = (double *)malloc(sizeof(double)*N);sol= (double *)malloc(sizeof(double)*N);r = (double *)malloc(sizeof(double)*N);p = (double *)malloc(sizeof(double)*N);

Ax = (double *)malloc(sizeof(double)*N);

for (i = 0; i < N; i++){sol[i] = (double)i; //厳密解を設定

x[i] = 0.0; //近似解を0で初期化}//係数行列Aの生成setTridiagonalMatrix(A, N);//右辺ベクトルbの生成setRightHandSideVector(b, A, sol, N);// :// ここで共役勾配法を実行// ://確保したメモリを解放free(x);free(b);free(A);free(sol);free(r);free(p);free(Ax);

} cg_cpu.c

CPUプログラム（係数行列の生成）


void setTridiagonalMatrix(double *A, int N){int i,j;

for (j = 0; j < N; j++){for (i = 0; i < N; i++){

A[i+N*j] = 0.0;}

}

i = 0;A[i + N*i ] = ‐4.0;A[i + N*i+1] = 1.0;

for(i = 1; i < N‐1; i++){A[i + N*i‐1] = 1.0;A[i + N*i ] = ‐4.0;A[i + N*i+1] = 1.0;

}i = N‐1;

A[i + N*i‐1] = 1.0;A[i + N*i ] = ‐4.0;

}

41141

14114

cg_cpu.c

i

j

CPUプログラム（右辺ベクトルの生成）


void setRightHandSideVector(double *b, double *A, double *x, int N){

int i,j;

for (i = 0; i < N; i++){b[i] = 0.0;for (j = 0; j < N; j++){ //係数行列と厳密解ベクトルを用いて行列－ベクトル積を計算し，

b[i] += A[i + N*j] * x[j]; //結果を右辺ベクトルに代入}

}

}

cg_cpu.c

CPUプログラム（共役勾配法部分）


//残差ベクトルの計算 r(0)=b−Ax(0)

computeResidual(r, b, A, x, N);//残差ベクトルの内積を計算rr = innerProduct(r, r, N);

k = 1;while(rr>err_tol*err_tol && k<=max_ite){

if (k == 1){//p(k) = r(k)+c2

(k−1)p(k−1) c2とpが0のためp(k) = r(k)copy(p, r, N);

}else{

c2 = rr / (c1*dot);//p(k) = r(k)+c2

(k−1)p(k−1)

computeVectorAdd(p, c2, r, 1.0, N);}

//(p(k), Ap(k))を計算//行列ベクトル積Apを実行し，結果とpの内積computeMxV(Ax, A, p, N);dot = innerProduct(p, Ax, N);c1 = rr / dot;

//x(k+1) = x(k)+c1(k)p(k)

//r(k+1) = r(k)−c1(k)Ap(k)

computeVectorAdd(x, 1.0, p, c1, N);computeVectorAdd(r, 1.0, Ax,‐c1, N);

//残差ベクトルの内積を計算rr = innerProduct(r, r, N);

k++;}

/*Compute r(0)=b−Ax(0). Set p(0)=0,c2

(0)=0.For k=1,…, until ||r||/||b|| < , Do

p(k) = r(k)+c2(k−1)p(k−1)

c1(k) = (r(k), r(k))/(p(k), Ap(k))

x(k+1) = x(k)+c1(k)p(k)

r(k+1) = r(k)−c1(k)Ap(k)

c2(k) = (r(k+1), r(k+1))/{c1

(k)(p(k), Ap(k))}EndDo*/ cg_cpu.c

CPUプログラム（共役勾配法内の関数）


//残差ベクトルr(0)=b−Ax(0)の計算void computeResidual(double *r, double *b,

double *A, double *x,int N){

int i,j;double Ax;for (i = 0; i < N; i++){

Ax = 0.0;for (j = 0; j < N; j++){

Ax += A[i + N*j] * x[j];}r[i] = b[i]‐Ax;

}

}

//内積の計算double innerProduct(double *vec1,

double *vec2, int N){

int i;double dot;

dot=0.0;for (i = 0; i < N; i++){

dot += vec1[i]*vec2[i];}

return dot;

}

cg_cpu.c

CPUプログラム（共役勾配法内の関数）


//値のコピー（単純代入）p(k) = r(k)void copy(double *lhs, double *rhs, int N){

int i;for (i = 0; i < N; i++){

lhs[i] = rhs[i];}

}

//ベクトル和y(k) = ax(k) + by(k)の計算void computeVectorAdd(

double *y, const double b, double *x, const double a,

int N){

int i;for (i = 0; i < N; i++){

y[i] = a*x[i] + b*y[i];}

}

//行列－ベクトル積Axの計算void computeMxV(double *Ax,

double *A, double *x, int N){

int i,j;for (i = 0; i < N; i++){

Ax[i] = 0.0;for (j = 0; j < N; j++){

Ax[i] += A[i + N*j] * x[j];}

}

}

cg_cpu.c

実行結果（N=2^10）


iteration = 1, residual = 2.098826e+003iteration = 2, residual = 4.308518e+002iteration = 3, residual = 1.102497e+002:

iteration = 21, residual = 5.547180e‐009iteration = 22, residual = 1.486358e‐009iteration = 23, residual = 3.982675e‐010x[0] = ‐0.000000x[1] = 1.000000x[2] = 2.000000:

x[1021] = 1021.000000x[1022] = 1022.000000x[1023] = 1023.000000Total amount of Absolute Error = 5.009568e‐010

収束履歴


反復回数

残差

Thrustの紹介

Thrust


CUDA用の並列アルゴリズムライブラリ

C++の標準テンプレートライブラリ(STL)とよく似た高水準なインタフェースを保有

CUDA4.0からCUDA本体に吸収

https://developer.nvidia.com/thrust

Thrust


データを保持・管理するためのコンテナ

C++のvectorに類似

便利な1次元配列（のようなもの）

コンテナに対する並列処理アルゴリズム

コンテナの各要素に対する操作

演算などの具体的な操作を指定

並び替え

縮約

コンテナ


thrust::host_vector<T> ホスト（CPU）メモリに確保されるvector

thrust::device_vector<T> デバイス（GPU）メモリに確保されるvector

#include<thrust/host_vector.h>#include<thrust/device_vector.h>int main(){

thrust::host_vector<int> h_vec(3); //3要素からなるvectorをホストメモリに確保//templateを利用して型を決定

h_vec[0] = 10; h_vec[1] = 20; h_vec[2] = 30;//配列のように各要素にアクセス

thrust::device_vector<int> d_vec = h_vec; //vectorをGPUへコピー

return 0; //vectorは自動的に解放される（free()やcudaFree()を呼ぶ必要がない）}

確保される場所が異なるだけで取り扱い方法は同じ

型を指定

template


コンパイル時にコードを生成する機能

テンプレート仮引数(パラメータ)を利用して処理を記述

テンプレート実引数の情報からコードを生成（実体化）

C言語の関数形式マクロの安全かつ高機能版

template<typename T>T add(T a, T b){

return a + b;}

int main(void){int ia=1,ib=2;float fa=1.0f,fb=2.0f;add<int>(ia,ib); //typename Tが全てintになるadd<float>(fa,fb); //typename Tが全てfloatになるreturn 0;

}

イテレータ（iterator, 反復子）


Thrustの並列処理の書き方

forループのように個々の要素に対する処理は書かない

「vectorのここからここまでにこの処理を適用する」という書き方をする

#include<thrust/host_vector.h>

int main(){

thrust::host_vector<int> vec1(3);thrust::host_vector<int> vec2(3);

vec1[0] = 10; vec1[1] = 20; vec1[2] = 30;//ここからここまで vec1をコピー（コピー先はvec2の初から）

thrust::copy(vec1.begin(), vec1.end(), vec2.begin());

return 0;}



イテレータ

vec1.begin() vectorの最初の要素を返す

vec1.end() vectorの最後の要素の一つ後ろを返す

10 20 30vec1(3)

vec1[0] vec1[1] vec1[2]

vec1.begin() vec1.end()



ポインタを使った処理に類似

#include<stdio.h>int main(void){

int vec1[3], vec2[3];

vec1[0]=10; vec1[1]=20; vec1[2]=30;vec2[0]= 0; vec2[1]= 0; vec2[2]= 0;

for(int i=0;i<3;i++) //標準的なC言語的書き方vec2[i] = vec1[i]; //ループと配列，配列添字を利用

for(int i=0;i<3;i++)printf("%d¥n",vec2[i]);

return 0;}



ポインタを使った処理に類似

#include<stdio.h>int main(void){

int vec1[3], vec2[3];

vec1[0]=10; vec1[1]=20; vec1[2]=30;vec2[0]= 0; vec2[1]= 0; vec2[2]= 0;

int *src = vec1, *end = src+3; //vec1の先頭要素，終端要素+1を設定int *dst = vec2; //コピー先の先頭要素を設定while(src != end) //ポインタに対する演算を使った書き方

*(dst++) = *(src++); //


return 0;}

vectorと配列の相互利用


vectorからポインタ（配列）へ変換

thrust::raw_pointer_cast()を利用してvectorの0番目の要素のアドレスをポインタ変数に代入

#include<stdio.h>#include<thrust/host_vector.h>int main(void){

thrust::host_vector<int> vec1(3);int *ptr_vec;

vec1[0]=10; vec1[1]=20; vec1[2]=30;

ptr_vec = thrust::raw_pointer_cast(&vec1[0]); //vec1[0]のアドレスを取り出して//ポインタ変数に代入

for(int i=0;i<3;i++)printf("%d¥n",ptr_vec[i]); //ptr_vecとvec1は同じデータにアクセス

return 0;}



ポインタ（配列）をvectorとして利用

ホスト（CPU）のメモリ上に定義された配列（ポインタ）

Thrust関数の引数として利用可能#include<stdio.h>#include<thrust/host_vector.h>int main(void){

int vec1[3]; //C言語の標準的な配列thrust::host_vector<int> vec2(3);//thrustのベクトル（ホストに確保）

vec1[0]=10; vec1[1]=20; vec1[2]=30;

thrust::copy(vec1,vec1+3,vec2.begin());//配列名（ポインタ）をthrust関数の//引数として利用


return 0;}



ポインタ（配列）をvectorとして利用

ホスト（CPU）のメモリ上に定義された配列（ポインタ）はThrust関数の引数として利用可能

デバイス（GPU）のメモリ上に定義された配列（ポインタ）はdevice_ptr型を利用して変換

thrust/device_ptr.hで定義



#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/device_vector.h>#include<thrust/device_ptr.h>int main(void){

int *vec1; //GPU上に確保するメモリへのポインタthrust::host_vector<int> vec2(3); //thrustのベクトル（ホストに確保）

cudaMalloc( (void **)&vec1, 3*sizeof(int) );//GPU上にメモリを確保thrust::device_ptr<int> dev_vec1(vec1);//device_ptr型変数dev_vec1を宣言し，

//vec1を包み隠すdev_vec1[0]=10; dev_vec1[1]=20; dev_vec1[2]=30; //dev_vec1をベクトルとして利用

thrust::copy(dev_vec1,dev_vec1+3,vec2.begin()); //関数の引数として利用for(int i=0;i<3;i++)

printf("%d¥n",vec2[i]);

cudaFree(vec1);//vec1は解放が必要return 0;

}

並び替え


GPUでは高速な実装が難しい処理の一つ#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/sort.h>int main(void){

thrust::host_vector<int> vec1(3);

vec1[0]=30;vec1[1]=10;vec1[2]=20;

thrust::sort(vec1.begin(),vec1.end());//並び替えを実行

for(int i=0;i<3;i++)printf("%d¥n",vec1[i]); //10,20,30の順に表示される

return 0;}

総和


GPUでは高速な実装が難しい処理の一つ#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/reduce.h>int main(void){

thrust::host_vector<int> vec1(3);

vec1[0]=30;vec1[1]=10;vec1[2]=20;

int sum = thrust::reduce(vec1.begin(),vec1.end());//総和を計算

printf("%d¥n",sum); //60が表示される

return 0;}

vectorに対する任意の処理


様々な機能を提供するイテレータを利用

constant_iterator 一定値を返すイテレータ

vectorの値を0で初期化する場合などに有効

counting_iterator 初期値と増分を指定すると，連続的に変化する値を返すイテレータ

1,2,3,4･･･など連続的な値の生成に有効

transform_iterator vectorの各要素に対する処理を定め，vectorの全要素に所定の処理を施すイテレータ

関数呼出を模擬

constant_iterator


一定値を返すイテレータ

vectorの値を0で初期化する場合などに有効

fill

#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/constant_iterator.h>int main(void){

int N=10;thrust::host_vector<int> vec(N);

thrust::constant_iterator<int> itr_cnst(0);//一定値（ここでは0）を返すイテレータ

thrust::copy(itr_cnst, itr_cnst+N, vec.begin());//イテレータがN個のデータを//生成し，vecに書込

for(int i=0;i<N;i++)printf("vec(%d) = %d¥n", i, vec[i]);

return 0;}

一定値

constant_iterator


thrust::fill()を使ったvectorの初期化

#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/constant_iterator.h>int main(void){


thrust::fill(vec.begin(),vec.end(), (int)10);//vecの全要素の値に10を設定


return 0;}

一定値

counting_iterator


連続的に変化する値を返すイテレータ

1,2,3,4･･･など連続的な値の生成に有効

fill

#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/counting_iterator.h>int main(void){


thrust::counting_iterator<int> itr_count(0);//0から1ずつ連続的に変化する値を//返すイテレータ

thrust::copy(itr_count, itr_count+N, vec.begin());//イテレータがN個のデータを//生成し，vecに書込


return 0;}

初期値

counting_iterator


thrust::sequence()を使ったvectorの初期化

#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/sequence.h>int main(void){


thrust::sequence(vec.begin(),vec.end());//vec[0]に0,vec[1]に1,...を設定


return 0;}

transform_iterator


vectorの全要素に所定の処理を施すイテレータ

関数呼出を模擬#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/transform_iterator.h>#include<thrust/sequence.h>int main(void){

int N=10;thrust::host_vector<int> vec(N);thrust::sequence(vec.begin(),vec.end());//vecに連続的な値を設定//vecに対して定められた処理を実行した結果を返すイテレータthrust::transform_iterator<処理,thrust::host_vector<int>::iterator,int>

itr_trns(vec.begin(),処理);thrust::copy(itr_trns, itr_trns+N, vec.begin());for(int i=0;i<N;i++)

printf("vec(%d) = %d¥n", i, vec[i]);

return 0;}

所定の処理をどう定めるか？

transform_iterator


所定の処理をどのように記述するか

関数オブジェクト（ファンクタ, Functor）を利用

operatorメンバ関数が定義されたクラス

クラスのインスタンス名が関数になる

#include<stdio.h>struct negate{

int operator()(const int x){return ‐x;

}};int main(void){

negate func;for(int i=0;i<10;i++)

printf("%d, %d¥n", i, func(i));//iの値，iに‐1をかけた値が表示される

return 0;}

transform_iterator


#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/transform_iterator.h>#include<thrust/sequence.h>struct negate{//入力された値に‐1をかけた値を返す

__host__ __device__ int operator()(const int x){return ‐x;

}};

int main(void){int N=10;thrust::host_vector<int> vec(N);thrust::sequence(vec.begin(),vec.end());//vecに0,1,2,3,...を設定//vecの要素に対してnegate()を実行した結果を返すイテレータthrust::transform_iterator<negate,thrust::host_vector<int>::iterator,int>

itr_trns(vec.begin(),negate());thrust::copy(itr_trns, itr_trns+N, vec.begin());

for(int i=0;i<N;i++) printf("vec(%d) = %d¥n", i, vec[i]);return 0;

}

transform関数


#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/transform_iterator.h>#include<thrust/sequence.h>struct negate{//入力された値に‐1をかけた値を返す

__host__ __device__ int operator()(const int x){return ‐x;

}};int main(void){

int N=10;thrust::host_vector<int> vec(N);//vecに0,1,2,3,...を設定thrust::sequence(vec.begin(),vec.end());//vecの要素に対してnegate()を実行した結果をvecに代入thrust::transform(vec.begin(), vec.end(), vec.begin(), negate());

for(int i=0;i<N;i++) printf("vec(%d) = %d¥n", i, vec[i]);return 0;

}


なんかめんどくさくない？

l2ノルムの計算（単純な実装）


#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/sequence.h>struct square{//入力された値の2乗を返す

__host__ __device__ int operator()(int x){return x*x;

}};int main(void){//vec={1,2,3,4,...,N}のl2ノルムを計算するプログラム

int N=10;thrust::host_vector<int> vec(N);//ベクトルの値（1,2,3,4,...）を保持thrust::host_vector<int> temp(N);//ベクトルの値の2乗（1,4,9,16,...）を保持//メモリの読み書き write N, read N + write N, read N

thrust::sequence(vec.begin(),vec.end(),1);//vecに1,2,3,4,...を設定thrust::transform(vec.begin(),vec.end(),temp.begin(),square());//vecの2乗float snrm2 = sqrt(thrust::reduce(temp.begin(),temp.end()));//総和と平方根

printf("%f¥n",snrm2);return 0;

} 無駄が多い書き方

l2ノルムの計算（改良版１）


#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/counting_iterator.h>#include<thrust/iterator/transform_iterator.h>struct square{//入力された値の2乗を返す


}};int main(void){

int N=10;thrust::counting_iterator<int> itr_count(1);//1,2,3,4,...を返すイテレータ//counting iteratorから返される値を2乗するイテレータ

thrust::transform_iterator<square, thrust::counting_iterator<int>, int> itr_trans(itr_count,square());

//メモリへの読み書き read Nfloat snrm2 = sqrt(thrust::reduce(itr_trans, itr_trans+N));//総和と平方根printf("%f¥n",snrm2);return 0;

} まだ簡略化できる

l2ノルムの計算（改良版２）


#include<stdio.h>#include<thrust/host_vector.h>#include<thrust/iterator/counting_iterator.h>#include<thrust/iterator/transform_iterator.h>struct square{//入力された値の2乗を返す


}};

int main(void){int N=10;//メモリへの読み書き read N

float snrm2 = sqrt(thrust::transform_reduce(thrust::counting_iterator<int>(1),thrust::counting_iterator<int>(N+1), square(), (float)0, thrust::plus<float>()));

printf("%f¥n",snrm2);return 0;

}

ベクトル和（Functorを利用）


#include<thrust/device_vector.h>#include<thrust/transform.h>#include<thrust/sequence.h>#include<iostream>struct saxpy{//a*x + yを返す

float a; //aの値saxpy(float _a) : a(_a){} //コンストラクタ__host__ __device__ float operator()(float x, float y){

return a*x + y;}

};int main(void){

int N=256;thrust::device_vector<float> x(N), y(N);thrust::sequence(x.begin(),x.end(),1); //xに1,2,3,4,...を設定thrust::fill(y.begin(),y.end(),0); //yを0に初期化

//ベクトルx,yの全要素をsaxpyに渡して2.0*x+yを計算し，結果をyに代入thrust::transform(x.begin(),x.end(),y.begin(),y.begin(),saxpy(2.0));

for(int i=0;i<N;i++) std::cout<< "y("<<i<<")="<<y[i]<<std::endl;return 0;

}

ベクトル和（placeholdersを利用）


#include<thrust/device_vector.h>#include<thrust/transform.h>#include<thrust/sequence.h>#include<iostream>

int main(void){int N=256;thrust::device_vector<float> x(N), y(N);thrust::sequence(x.begin(),x.end(),1);thrust::fill(y.begin(),y.end(),0);

//ベクトルx,yの全要素で2.0*x+yを計算し，結果をyに代入//thrust::placeholdersを利用する事で，引数のベクトルに対する処理を直接記述（ファンクタが不要）

thrust::transform(x.begin(),x.end(),y.begin(),y.begin(),2.0*thrust::placeholders::_1+thrust::placeholders::_2);

for(int i=0;i<N;i++)std::cout<< "y("<<i<<")="<<y[i]<<std::endl;

return 0;}

Thrustライブラリを利用した実装効率化

共役勾配法のアルゴリズム（再掲）


連立一次方程式Ax=bに対する共役勾配法

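Compute $r^{(0)} = b - A x^{(0)}$. Set $p^{(0)} = 0$, $c_2^{(0)} = 0$.

For $k = 1, \ldots$, until $\|r\| / \|b\| < \varepsilon$, Do

$p^{(k)} = r^{(k)} + c_2^{(k-1)} p^{(k-1)}$

$c_1^{(k)} = (r^{(k)}, r^{(k)}) / (p^{(k)}, A p^{(k)})$

$x^{(k+1)} = x^{(k)} + c_1^{(k)} p^{(k)}$

$r^{(k+1)} = r^{(k)} - c_1^{(k)} A p^{(k)}$

$c_2^{(k)} = (r^{(k+1)}, r^{(k+1)}) / \{ c_1^{(k)} (p^{(k)}, A p^{(k)}) \}$

EndDo

$Ap$ 係数行列Aとベクトルpの積

$( \cdot , \cdot )$ ベクトル同士の内積

$A$ 係数行列，$x$ 解ベクトル，$b$ 右辺ベクトル，$r$ 残差ベクトル，$p$ 補助ベクトル，$\|\cdot\|$ $l_2$-ノルム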

共役勾配法の各処理の置き換え


共役勾配法に必要な処理とcuBLAS/cuSPARSE関数の対応

処理関数

ベクトルの代入(y[]=x[]) cublas<>copy

ベクトルのスカラ倍(y[]=a*y[]) cublas<>scal

ベクトル同士の和(y[]=a*x[]+y[]) cublas<>axpy

ベクトルの内積(y=x[]·x[]) cublas<>dot

行列－ベクトル積(y[]=A[][]x[]) cusparse<>csrmv

GPUプログラム（制御部分）


#include <stdlib.h>#include <stdio.h>#include <cublas_v2.h>#include <cusparse.h>int main(void){

int N = 1 << 10; //未知数の数210const double err_tol = 1e‐9; //許容誤差const int max_ite = 1<<20; //反復回数の上限

double *x; //近似解ベクトルdouble *b; //右辺ベクトルdouble *Aval; //係数行列Aの情報int *AcolIdx; //要素数の列の情報int *ArowPtr; //行にある要素数の情報int NonZero; //非ゼロ要素数double *sol; //厳密解

//GPU用変数double *d_x;double *d_b;double *d_Aval; //GPU用変数int *d_AcolIdx; //int *d_ArowPtr; //

cg_cusparse.cu

（cuSPARSE版再掲）



double *d_r, rr;double *d_p, *d_Ax;double c1, c2, minusC1, dot;

//cublas,cusparseで利用する定数const double one = 1.0;const double zero = 0.0;const double minusOne = ‐1.0;int i, k;

NonZero = 2 + (N‐2)*3 + 2;Aval = (double *)malloc(sizeof(double)*NonZero);AcolIdx = (int *)malloc(sizeof(int) *NonZero);ArowPtr = (int *)malloc(sizeof(int) *N+1 );x = (double *)malloc(sizeof(double)*N);b = (double *)malloc(sizeof(double)*N);sol = (double *)malloc(sizeof(double)*N);

for (i = 0; i < N; i++){sol[i] = (double)i; //厳密解を設定

x[i] = 0.0; //近似解を0で初期化} cg_cusparse.cu




//係数行列Aの生成setTridiagonalMatrix(Aval, AcolIdx, ArowPtr, N, NonZero);//右辺ベクトルbの生成setRightHandSideVector(b, Aval, AcolIdx, ArowPtr, sol, N);

//cuBLASハンドルの生成cublasHandle_t cublasHandle = 0;cublasStatus_t cublasStatus;cublasStatus=cublasCreate(&cublasHandle);

//cuSPARSハンドルの生成cusparseHandle_t cusparseHandle = 0;cusparseStatus_t cusparseStatus;cusparseStatus = cusparseCreate(&cusparseHandle);

//疎行列の詳細情報を扱う変数descrを生成cusparseMatDescr_t descr = 0;cusparseStatus = cusparseCreateMatDescr(&descr);cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); //行列の種類の指定cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);//配列が0開始であることを明記

cg_cusparse.cu




//GPU上にメモリを確保cudaMalloc((void **)&d_Aval , NonZero*sizeof(double));cudaMalloc((void **)&d_AcolIdx, NonZero*sizeof(int) );cudaMalloc((void **)&d_ArowPtr, (N+1) *sizeof(int) );cudaMalloc((void **)&d_x, N *sizeof(double));cudaMalloc((void **)&d_b, N *sizeof(double));cudaMalloc((void **)&d_r, N *sizeof(double));cudaMalloc((void **)&d_p, N *sizeof(double));cudaMalloc((void **)&d_Ax,N *sizeof(double));

//GPU上のメモリに設定したデータをコピーcublasSetVector(NonZero, sizeof(double), Aval , 1, d_Aval , 1);cublasSetVector(NonZero, sizeof(int) , AcolIdx, 1, d_AcolIdx, 1);cublasSetVector(N+1 , sizeof(int) , ArowPtr, 1, d_ArowPtr, 1);cublasSetVector(N, sizeof(double), x, 1, d_x, 1);cublasSetVector(N, sizeof(double), b, 1, d_b, 1);

// :// ここで共役勾配法を実行// :

cg_cusparse.cu




//ハンドルの破棄cublasDestroy(cublasHandle);cusparseDestroy(cusparseHandle);

//確保したメモリを解放free(x);free(b);free(Aval);free(AcolIdx);free(ArowPtr);free(sol);cudaFree(d_x);cudaFree(d_b);cudaFree(d_Aval);cudaFree(d_AcolIdx);cudaFree(d_ArowPtr);cudaFree(d_r);cudaFree(d_p);cudaFree(d_Ax);

} cg_cusparse.cu


GPUプログラム（共役勾配法部分）



cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, one, descr, d_Aval, d_ArowPtr, d_AcolIdx, d_x, zero, d_Ax);

//cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, NonZero,// &one, descr, d_Aval, d_ArowPtr, d_AcolIdx, d_x, &zero, d_Ax);cublasDcopy(cublasHandle, N, d_b, 1, d_r, 1);cublasDaxpy(cublasHandle, N, &minusOne, d_Ax, 1, d_r, 1);//残差ベクトルの内積を計算cublasDdot(cublasHandle, N, d_r, 1, d_r, 1, &rr);


if (k == 1){//p(k) = r(k)+c2

(k−1)p(k−1) c2とpが0のためp(k) = r(k)cublasDcopy(cublasHandle, N, d_r, 1, d_p, 1);

}else{

c2 = rr / (c1*dot);//p(k) = r(k)+c2

(k−1)p(k−1)

cublasDscal(cublasHandle, N, &c2, d_p, 1);cublasDaxpy(cublasHandle, N, &one, d_r, 1, d_p, 1);

}

cusparseが新しいバージョンの場合

cg_cusparse.cu




//(p(k), Ap(k))を計算//行列ベクトル積Apを実行し，結果とpの内積cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N,

one, descr, d_Aval, d_ArowPtr, d_AcolIdx, d_p, zero, d_Ax);cublasDdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);c1 = rr / dot;

//x(k+1) = x(k)+c1(k)p(k)

//r(k+1) = r(k)−c1(k)Ap(k)

cublasDaxpy(cublasHandle, N, &c1, d_p, 1, d_x, 1);cublasDaxpy(cublasHandle, N, &minusC1, d_Ax, 1, d_r, 1);

//残差ベクトルの内積を計算cublasDdot(cublasHandle, N, d_r, 1, d_r, 1, &rr);

cudaThreadSynchronize();k++;

}

//計算結果をGPUからCPUへコピーcublasGetVector(N, sizeof(double), d_x, 1, x, 1); cg_cusparse.cu


Thrust導入の狙い


vectorの利用

配列をvectorに置き換え

明示的なメモリ確保やコピーを排除

冗長な処理を排除

transform関数とファンクタを利用

新しいバージョンではplaceholdersが利用できるのでファンクタすら不要

r(0)=b−Ax(0)の置き換え


cuBLASはベクトルを2個までしか扱えない

3段階に分けて計算

行列－ベクトル積を計算 r(0)=b−Ax(0)

計算した結果を別の配列（例えばAx）に保持

cublasDgemv

bの値をrに代入 r(0)=r−Ax(0)

bを使う箇所をrに置き換えることができる

cublasDcopy

ベクトル和を計算 r(0)=r−1×Ax(0)

cublasDaxpy

r(0)=b−Ax(0)の置き換え


Thrustのtransformはベクトル3個を扱える

cuBLASでは分ける必要があった計算を分けずに計算

行列－ベクトル積だけはcuSPARSEを使って別途計算

//Functortemplate<typename T> struct axpy{

T a;axpy(T _a) :a(_a){}

__host__ __device__ T operator()(const T x, const T y){//y <‐ a*x + yreturn a*x + y;

}};//thrust::transform関数を用いた記述//r=b−1×Axthrust::transform(Ax.begin(),Ax.end(),b.begin(),r.begin(),axpy<double>(‐1.0));

p(k) = r(k)+c2(k−1)p(k−1)の置き換え


k=1 c2=0のため単純な代入 p(k) = r(k)

cublasDcopy

k>1 ベクトル和を計算 $p^{(k)} = r^{(k)} + c_2^{(k-1)} p^{(k-1)}$

またもや難題

cublasDaxpyでは足されるベクトル(p)への操作はできない

2段階に分けて処理を実行

pの値をc2倍 p(k)=c2×p(k−1)

cublasDscal ベクトル和を計算 p(k) = p(k)+1×r(k)

cublasDaxpy

p(k) = r(k)+c2(k−1)p(k−1)の置き換え


k>1 ベクトル和を計算 $p^{(k)} = r^{(k)} + c_2^{(k-1)} p^{(k-1)}$

Thrustのtransformは任意の処理を実行できる

k=1は単純な代入のため代入演算子を利用

//Functortemplate<typename T> struct xpay{

T a;xpay(T _a) :a(_a){}

__host__ __device__ T operator()(const T x, const T y){//y <‐ x + a*yreturn x + a*y;

}};//thrust::transform関数を用いた記述//p = r+c2pthrust::transform(r.begin(), r.end(), p.begin(), p.begin(), xpay<double>(c2));



#include <stdlib.h>#include <stdio.h>#include <cublas_v2.h> //cuBLASはもう使わない#include <cusparse.h> //cuSPARSEは行列－ベクトル積のみ使う

//thrustのヘッダファイル群#include<thrust/host_vector.h>#include<thrust/device_vector.h>#include<thrust/iterator/transform_iterator.h>#include<thrust/sequence.h>#include<thrust/inner_product.h>

int main(void){

int N = 1 << 20; //未知数の数220const double err_tol = 1e‐9; //許容誤差const int max_ite = 1<<20; //反復回数の上限int NonZero = 2 + (N‐2)*3 + 2; //非ゼロ要素数

cg_thrust.cu



thrust::host_vector<double> x(N); //近似解ベクトルthrust::host_vector<double> b(N); //右辺ベクトルthrust::host_vector<double> Aval(Nnz); //係数行列Aの情報thrust::host_vector<int> AcolIdx(Nnz); //要素数の列の情報thrust::host_vector<int> ArowPtr(N+1); //行にある要素数の情報thrust::host_vector<double> sol(N); //厳密解

double c1, c2, rr, dot;int k;const double one = 1.0;const double zero = 0.0;

thrust::fill(b.begin(), b.end(), (double)0); //右辺ベクトルbを初期化thrust::fill(x.begin(), x.end(), (double)0); //近似解ベクトルxを初期化thrust::sequence(sol.begin(), sol.end()); //厳密解ベクトルを0,1,2,...と初期化

cg_thrust.cu



//係数行列と右辺ベクトルを生成//vectorをポインタにキャストすることで既存の関数を再利用setTridiagonalMatrix(thrust::raw_pointer_cast(&Aval[0]),

thrust::raw_pointer_cast(&AcolIdx[0]),thrust::raw_pointer_cast(&ArowPtr[0]), N, NonZero);

setRightHandSideVector(thrust::raw_pointer_cast(&b[0]),thrust::raw_pointer_cast(&Aval[0]),thrust::raw_pointer_cast(&AcolIdx[0]),thrust::raw_pointer_cast(&ArowPtr[0]),thrust::raw_pointer_cast(&sol[0]), N);

//cuSPARSハンドルの生成cusparseHandle_t cusparseHandle = 0;cusparseStatus_t cusparseStatus;cusparseStatus = cusparseCreate(&cusparseHandle);//疎行列の詳細情報を扱う変数descrを生成cusparseMatDescr_t descr = 0;cusparseStatus = cusparseCreateMatDescr(&descr);cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); //行列の種類の指定cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);//配列が0開始であることを明記

cg_thrust.cu



//GPU上の確保するvectorを宣言//変数名の頭にdev_を付けることで，host_vectorともポインタとも区別thrust::device_vector<double> dev_x = x;thrust::device_vector<double> dev_b = b;thrust::device_vector<double> dev_Aval = Aval;thrust::device_vector<int> dev_AcolIdx = AcolIdx;thrust::device_vector<int> dev_ArowPtr = ArowPtr;

thrust::device_vector<double> dev_r(N);thrust::device_vector<double> dev_p(N);thrust::device_vector<double> dev_Ax(N);

//行列－ベクトル積（cuSPARSE）で使うポインタ//変数名はcg_cusparse.cuから変更せず，ソースファイルの修正量を抑えるdouble *d_x = thrust::raw_pointer_cast(&dev_x[0]);double *d_Aval = thrust::raw_pointer_cast(&dev_Aval[0]);int *d_AcolIdx = thrust::raw_pointer_cast(&dev_AcolIdx[0]);int *d_ArowPtr = thrust::raw_pointer_cast(&dev_ArowPtr[0]);double *d_p = thrust::raw_pointer_cast(&dev_p[0]);double *d_Ax = thrust::raw_pointer_cast(&dev_Ax[0]);

cg_thrust.cu



// :// ここで共役勾配法を実行// :

//ハンドルの破棄cusparseDestroy(cusparseHandle);

//確保したメモリの解放は不要（vectorは自動でメモリを解放）

}

cg_thrust.cu

GPUプログラム（ファンクタ）


//main関数より上で定義

template<typename T> struct axpy{//a*x + yを返すT a;axpy(T _a) :a(_a){}

__host__ __device__ T operator()(const T x, const T y){//y <‐ a*x + yreturn a*x + y;

}};

template<typename T> struct xpay{//x + a*yを返すT a;xpay(T _a) :a(_a){}

__host__ __device__ T operator()(const T x, const T y){//y <‐ x + a*yreturn x + a*y;

}};

cg_thrust.cu





thrust::transform(dev_Ax.begin(), dev_Ax.end(), dev_b.begin(), dev_r.begin(), axpy<double>(‐1.0));

//thrustのバージョンが新しい場合は ‐1.0*thrust::placeholders::_1 + thrust::placeholders::_2//残差ベクトルの内積を計算rr = thrust::inner_product(dev_r.begin(), dev_r.end(), dev_r.begin(), 0.0);


if (k == 1){ //p(k) = r(k)+c2(k−1)p(k−1) c2とpが0のためp(k) = r(k)

dev_p = dev_r;}else{ //p(k) = r(k)+c2

(k−1)p(k−1)

c2 = rr / (c1*dot);thrust::transform(dev_r.begin(), dev_r.end(), dev_p.begin(), dev_p.begin(),

xpay<double>(c2));//thrustのバージョンが新しい場合は thrust::placeholders::_1 + c2*thrust::placeholders::_2

}cg_thrust.cu



//(p(k), Ap(k))を計算//行列ベクトル積Apを実行し，結果とpの内積cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N,

&one, descr, d_Aval, d_ArowPtr, d_AcolIdx, d_p, &zero, d_Ax);dot = thrust::inner_product(dev_p.begin(), dev_p.end(), dev_Ax.begin(), 0.0);c1 = rr / dot;

//x(k+1) = x(k)+c1(k)p(k)

//r(k+1) = r(k)−c1(k)Ap(k)

thrust::transform(dev_p .begin(), dev_p .end(), dev_x.begin(), dev_x.begin(),axpy<double>( c1));

//thrustのバージョンが新しい場合は c1*thrust::placeholders::_1 + thrust::placeholders::_2thrust::transform(dev_Ax.begin(), dev_Ax.end(), dev_r.begin(), dev_r.begin(),

axpy<double>(‐c1));//thrustのバージョンが新しい場合は‐c1*thrust::placeholders::_1 + thrust::placeholders::_2

//残差ベクトルの内積を計算rr = thrust::inner_product(dev_r.begin(), dev_r.end(), dev_r.begin(), 0.0);cudaThreadSynchronize();k++;

}//計算結果をGPUからCPUへコピーx = dev_x;

cg_thrust.cu

誤差の総量の算出


//Functor（main関数より上で定義しておくこと）struct absolute_error{

__host__ __device__double operator()(const double x, const double y){

return fabs(x ‐ y);}

};

//inner_productは単純な内積以外にも，2ベクトル間に対する処理を定義して実行できる//transform_reduceはベクトル1本に対してしか処理しかできないdouble err = 0.0;err = thrust::inner_product

(sol.begin(),sol.end(),x.begin(),0.0, thrust::plus<double>(), absolute_error());

cg_thrust.cu

実行結果（cuBLAS+cuSPARSE, N=2^10）





実行結果（thrust+cuSPARSE, N=2^10）





実行時間の比較


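| ベクトルの次元 | CPU [sec/iteration] | cuBLAS+cuSPARSE [sec/iteration] |
|---|---|---|
| $2^{14}$ | $0.8\times10^{-3}$ | $0.3\times10^{-3}$ |
| $2^{15}$ | $1.6\times10^{-3}$ | $0.645\times10^{-3}$ |
| $2^{16}$ | $3.46\times10^{-3}$ | $0.645\times10^{-3}$ |
| $2^{17}$ | $6.92\times10^{-3}$ | $0.625\times10^{-3}$ |
| $2^{18}$ | $13.7\times10^{-3}$ | $0.937\times10^{-3}$ |
| $2^{19}$ | $27.9\times10^{-3}$ | $1.43\times10^{-3}$ |
| $2^{20}$ | $54.6\times10^{-3}$ | $2.86\times10^{-3}$ |
| $2^{21}$ | $110\times10^{-3}$ | $5.17\times10^{-3}$ |
| $2^{22}$ | $219\times10^{-3}$ | $9.66\times10^{-3}$ |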

実行時間の比較


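| ベクトルの次元 | cuBLAS+cuSPARSE [sec/iteration] | thrust+cuSPARSE [sec/iteration] |
|---|---|---|
| $2^{14}$ | $0.3\times10^{-3}$ | $0.949\times10^{-3}$ |
| $2^{15}$ | $0.645\times10^{-3}$ | $0.968\times10^{-3}$ |
| $2^{16}$ | $0.645\times10^{-3}$ | $0.983\times10^{-3}$ |
| $2^{17}$ | $0.625\times10^{-3}$ | $1.08\times10^{-3}$ |
| $2^{18}$ | $0.937\times10^{-3}$ | $1.31\times10^{-3}$ |
| $2^{19}$ | $1.43\times10^{-3}$ | $1.93\times10^{-3}$ |
| $2^{20}$ | $2.86\times10^{-3}$ | $3.13\times10^{-3}$ |
| $2^{21}$ | $5.17\times10^{-3}$ | $5.56\times10^{-3}$ |
| $2^{22}$ | $9.66\times10^{-3}$ | $1.00\times10^{-2}$ |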

実行時間の比較（cuSPARSE）


ベクトルの次元

1反復あたりの実行時間[sec]

Thrustの“効率”


実行速度はcuBLAS+cuSPARSEよりも数%遅い

ソースコード記述量は大幅に削減

cudaMalloc/cudaFree cublasSetVector/Matrix 既存の配列・ポインタとの互換性保持（raw_pointer_cast）

Thrustを使う利点

ソースコードの可搬性向上

実行環境が異なっていてもそのまま利用可能

コンパイルオプション一つで実行するデバイスを切替

CPU（OpenMP等）とGPU

Thrustのデバイスシステムの選択


これまで

デバイス=GPU

Thrustではいくつかの並列処理の環境を選択可能

コンパイルオプション（プリプロセッサの定義）で指定

-Dに続けてTHRUST_DEVICE_BACKENDを指定*

CUDA (GPU)： THRUST_DEVICE_BACKEND_CUDA

OpenMP (CPU)： THRUST_DEVICE_BACKEND_OMP

TBB (CPU)： THRUST_DEVICE_BACKEND_TBB（Intelが提供する並列実行ライブラリ）

*古いバージョンではTHRUST_DEVICE_BACKENDだったが，現行はTHRUST_DEVICE_SYSTEMを利用する（THRUST_DEVICE_SYSTEM_CUDA等）



GPU(CUDA)を明示的に選択する場合

nvcc cg_thrust.cu -DTHRUST_DEVICE_BACKEND=THRUST_DEVICE_BACKEND_CUDA （*）

-Dの後ろにスペースは不要

=の前後にスペースは不要

*現行はTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA

thrustのデバイスシステムの選択


CPU上でOpenMPにより並列実行させる場合

device_vectorに対する処理が並列化

host_vectorに対する処理を並列化する場合はTHRUST_HOST_BACKEND=THRUST_HOST_BACKEND_OMPを定義

nvcc cg_thrust.cu -DTHRUST_DEVICE_BACKEND=THRUST_DEVICE_BACKEND_OMP （*） -Xcompiler -fopenmp -lgomp

コンパイル方法はこれで問題ないが･･･

cuSPARSEはCPUで実行できないため，対策が必要

#if - #endifディレクティブを使って処理を切替（**）

*現行はTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP

**処理はややこしくなるが，#if - #endifディレクティブを利用する練習（いいわけ）

#if - #endifディレクティブ

#if の後ろに定数式を記述

定数式が真なら#if ‐ #endif間の命令を有効化してコンパイル

defined()でマクロが定義されているかを判別

GPGPU講習会89 2015/10/28

#include<stdio.h>#define DEBUGint main(void){

int a=1,b=2,c=0;

c = a + b;

#if defined(DEBUG) //DEBUGが定義されていればprintfを実行printf("%d, %d, %d¥n",a,b,c);

#endif

return 0;}


定数式に利用できる演算

defined() マクロが定義されているかを判別

比較

a>b, a<b, a==b

論理演算

条件1 && 条件2 条件1 || 条件2 !条件

GPGPU講習会90 2015/10/28



THRUST_DEVICE_BACKEND_CUDAが定義されている場合

THRUST_DEVICE_BACKENDが定義されていない場合

cuSPARSEを有効化

ハンドルの生成や破棄，デスクリプタ作成も含む

THRUST_DEVICE_BACKEND_OMPが定義されている場合

CPUで実行できる行列－ベクトル積の関数を指定

前回の講習会で作った関数を再利用

#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)//GPUで実行する場合のみここを有効化#endif

#if (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_OMP)//OpenMPで並列化する場合のみここを有効化#endif

GPUプログラム


#include <stdlib.h>#include <stdio.h>#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)#include <cusparse.h> //GPUで実行する場合のみ有効化#endif//thrustのヘッダファイル群#include<thrust/host_vector.h>#include<thrust/device_vector.h>#include<thrust/iterator/transform_iterator.h>#include<thrust/sequence.h>#include<thrust/inner_product.h>

//functor，係数行列および右辺ベクトル生成関数を定義//疎行列－ベクトル積を計算する関数（OpenMPで並列化する場合のみ有効化）#if (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_OMP)void computeMxV(double *Ax, double *Aval, int *AcolIdx, int *ArowPtr, double *x, int N){

int i,ij;for (i = 0; i < N; i++){

Ax[i] = 0.0;for (ij = ArowPtr[i]; ij < ArowPtr[i+1]; ij++){

Ax[i] += Aval[ij] * x[AcolIdx[ij]];}

}

}#endif cg_thrust_omp.cu

GPUプログラム


int main(void){

int N = 1 << 20; //未知数の数 2^20
const double err_tol = 1e‐9; //許容誤差
const int max_ite = 1<<20; //反復回数の上限
int NonZero = 2 + (N‐2)*3 + 2; //非ゼロ要素数

thrust::host_vector<double> x(N); //近似解ベクトル
thrust::host_vector<double> b(N); //右辺ベクトル
thrust::host_vector<double> Aval(NonZero); //係数行列Aの情報
thrust::host_vector<int> AcolIdx(NonZero); //要素数の列の情報
thrust::host_vector<int> ArowPtr(N+1); //行にある要素数の情報
thrust::host_vector<double> sol(N); //厳密解

double c1, c2, rr, dot;int k;const double one = 1.0;const double zero = 0.0;

thrust::fill(b.begin(), b.end(), (double)0); //右辺ベクトルbを初期化thrust::fill(x.begin(), x.end(), (double)0); //近似解ベクトルxを初期化thrust::sequence(sol.begin(), sol.end()); //厳密解ベクトルを0,1,2,...と初期化

cg_thrust_omp.cu

GPUプログラム


//係数行列と右辺ベクトルを生成//vectorをポインタにキャストすることで既存の関数を再利用setTridiagonalMatrix(thrust::raw_pointer_cast(&Aval[0]),

thrust::raw_pointer_cast(&AcolIdx[0]),thrust::raw_pointer_cast(&ArowPtr[0]), N, NonZero);

setRightHandSideVector(thrust::raw_pointer_cast(&b[0]),thrust::raw_pointer_cast(&Aval[0]),thrust::raw_pointer_cast(&AcolIdx[0]),thrust::raw_pointer_cast(&ArowPtr[0]),thrust::raw_pointer_cast(&sol[0]), N);

//GPUで実行する場合のみ有効化#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)

//cuSPARSEハンドルの生成
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus;
cusparseStatus = cusparseCreate(&cusparseHandle);
//疎行列の詳細情報を扱う変数descrを生成
cusparseMatDescr_t descr = 0;
cusparseStatus = cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); //行列の種類の指定
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);//配列が0開始であることを明記

#endif

cg_thrust_omp.cu

GPUプログラム


//GPU上に確保するvectorを宣言
//変数名の頭にdev_を付けることで，host_vectorともポインタとも区別
thrust::device_vector<double> dev_x = x;
thrust::device_vector<double> dev_b = b;
thrust::device_vector<double> dev_Aval = Aval;
thrust::device_vector<int> dev_AcolIdx = AcolIdx;
thrust::device_vector<int> dev_ArowPtr = ArowPtr;

thrust::device_vector<double> dev_r(N);thrust::device_vector<double> dev_p(N);thrust::device_vector<double> dev_Ax(N);

//行列－ベクトル積（cuSPARSE）で使うポインタ//変数名はcg_cusparse.cuから変更せず，ソースファイルの修正量を抑えるdouble *d_x = thrust::raw_pointer_cast(&dev_x[0]);double *d_Aval = thrust::raw_pointer_cast(&dev_Aval[0]);int *d_AcolIdx = thrust::raw_pointer_cast(&dev_AcolIdx[0]);int *d_ArowPtr = thrust::raw_pointer_cast(&dev_ArowPtr[0]);double *d_p = thrust::raw_pointer_cast(&dev_p[0]);double *d_Ax = thrust::raw_pointer_cast(&dev_Ax[0]);

cg_thrust_omp.cu

GPUプログラム



#if (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_OMP)//OpenMPの場合computeMxV(d_Ax, d_Aval, d_AcolIdx, d_ArowPtr, d_x, N);

#endif#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)//CUDAの場合


#endifthrust::transform(dev_Ax.begin(), dev_Ax.end(), dev_b.begin(), dev_r.begin(), axpy<double>(‐1.0));//残差ベクトルの内積を計算rr = thrust::inner_product(dev_r.begin(), dev_r.end(), dev_r.begin(), 0.0);


if (k == 1){ //p(k) = r(k)+c2(k−1)p(k−1) c2とpが0のためp(k) = r(k)

dev_p = dev_r;}else{ //p(k) = r(k)+c2(k−1)p(k−1)

c2 = rr / (c1*dot);thrust::transform(dev_r.begin(), dev_r.end(), dev_p.begin(), dev_p.begin(), xpay<double>(c2));

}

cg_thrust_omp.cu

GPUプログラム


//(p(k), Ap(k))を計算//行列ベクトル積Apを実行し，結果とpの内積

#if (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_OMP)//OpenMPの場合computeMxV(d_Ax, d_Aval, d_AcolIdx, d_ArowPtr, d_p, N);

#endif#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)//CUDAの場合

cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, &one, descr, d_Aval, d_ArowPtr, d_AcolIdx, d_p, &zero, d_Ax);

#endifdot = thrust::inner_product(dev_p.begin(), dev_p.end(), dev_Ax.begin(), 0.0);c1 = rr / dot;

//x(k+1) = x(k)+c1(k)p(k)

//r(k+1) = r(k)−c1(k)Ap(k)

thrust::transform(dev_p .begin(), dev_p .end(), dev_x.begin(), dev_x.begin(), axpy<double>( c1));thrust::transform(dev_Ax.begin(), dev_Ax.end(), dev_r.begin(), dev_r.begin(), axpy<double>(‐c1));

//残差ベクトルの内積を計算rr = thrust::inner_product(dev_r.begin(), dev_r.end(), dev_r.begin(), 0.0);

#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)cudaThreadSynchronize();//GPUで実行する場合のみ有効化

#endifk++;

}//計算結果をGPUからCPUへコピーx = dev_x; cg_thrust_omp.cu

GPUプログラム


#if !defined(THRUST_DEVICE_BACKEND) || (THRUST_DEVICE_BACKEND==THRUST_DEVICE_BACKEND_CUDA)//ハンドルの破棄cusparseDestroy(cusparseHandle);//GPUで実行する場合のみ有効化

#endif

//確保したメモリの解放は不要（vectorは自動でメモリを解放）

}

cg_thrust_omp.cu



nvcc cg_thrust_omp.cu ‐DTHRUST_DEVICE_BACKEND=THRUST_DEVICE_BACKEND_OMP ‐Xcompiler ‐fopenmp ‐lgomp C/C++しか使っていなくてもnvccでコンパイルする必要があるか？

g++ cg_thrust_omp.cpp ‐fopenmp ‐DTHRUST_DEVICE_BACKEND=THRUST_DEVICE_BACKEND_OMP ‐lgomp ‐I/usr/local/cuda/include/ 既存のC++コンパイラでコンパイル可能

拡張子をcppに変更

CUDAのインクルードディレクトリの指定が必要

まとめ

2015/10/28GPGPU講習会100

Thrustを紹介

CUDA用の並列アルゴリズムライブラリ

GPUだけでなくCPUでも並列実行可能

異なる環境での可搬性を向上

共役勾配法の処理に導入

配列をvectorに置き換え

cudaMalloc/cudaFree，メモリ転送の記述を簡略化

cuBLASの処理を全て置換

実行速度はThrustの方が数％遅い

冗長な書き方はしなくてよくなる

全体のまとめ(GPU 適化ライブラリの利用)

2015/10/28GPGPU講習会101

cuBLAS, cuSPARSE, Thrustを紹介

cuBLAS, cuSPARSE 利用できる処理が決められており，柔軟ではないがその分非常に高速

cuBLASは処理の融通が利かない

cuSPARSEは疎行列の生成が困難

Thrust 非常に柔軟に処理が書けるが，速度はcuBLASに若干劣る

並列処理も便利だが，vectorが非常に便利

コンパイルオプション一つで実行環境を切替可能

gpgpu seminar (gpu accelerated libraries, 3 of 3, thrust)

Engineering