- •Foreword
- •CUDA installation
- •Installing CUDA environment
- •Measuring GPUs performance
- •Linpack benchmark for CUDA
- •Tests results
- •One Tesla S2050 GPU (428.9 GFlop/s)
- •Two Tesla S2050 GPUs (679.0 GFlop/s)
- •Four Tesla S2050 GPUs (1363 GFlop/s)
- •Two Tesla K20m GPUs (1789 GFlop/s)
- •CUBLAS by example
- •General remarks on the examples
- •CUBLAS Level-1. Scalar and vector based operations
- •cublasIsamax, cublasIsamin - maximal, minimal elements
- •cublasSasum - sum of absolute values
- •cublasScopy - copy vector into vector
- •cublasSdot - dot product
- •cublasSnrm2 - Euclidean norm
- •cublasSrot - apply the Givens rotation
- •cublasSrotg - construct the Givens rotation matrix
- •cublasSscal - scale the vector
- •cublasSswap - swap two vectors
- •CUBLAS Level-2. Matrix-vector operations
- •cublasSger - rank one update
- •cublasStbsv - solve the triangular banded linear system
- •cublasStpsv - solve the packed triangular linear system
- •cublasStrsv - solve the triangular linear system
- •CUBLAS Level-3. Matrix-matrix operations
- •cublasStrsm - solving the triangular linear system
- •MAGMA by example
- •General remarks on Magma
- •Remarks on installation and compilation
- •Remarks on hardware used in examples
- •Magma BLAS
- •LU decomposition and solving general linear systems
- •QR decomposition and the least squares solution of general systems
- •Eigenvalues and eigenvectors for general matrices
- •Eigenvalues and eigenvectors for symmetric matrices
- •Singular value decomposition
3.2 CUBLAS Level-1. Scalar and vector based operations |
|
|
|
30 |
|
printf (" d1 : %7.3 f\n" ,d1 ); |
// |
d1 |
|||
printf (" d2 : %7.3 f\n" ,d2 ); |
// |
d2 |
|||
stat = |
cublasCreate (& handle ); // initialize |
CUBLAS |
context |
||
float |
x1 =1.0 f; |
|
// |
x1 =1 |
|
float |
y1 =2.0 f; |
|
// |
y1 =2 |
|
printf (" x1 : %7.3 f\n" ,x1 ); |
// |
x1 |
|||
printf (" y1 : %7.3 f\n" ,y1 ); |
// |
y1 |
// find |
modified |
Givens |
rotation |
matrix H ={{ h11 , h12 } ,{ h21 , h22 }} |
|
// such |
that |
the |
second |
entry of |
H *{\ sqrt { d1 }* x1 ,\ sqrt { d2 }* y1 }^ T |
// is zero |
|
|
|
|
|
stat=cublasSrotmg(handle,&d1,&d2,&x1,&y1,param); |
|||||
printf (" After |
srotmg :\ n" ); |
|
|||
printf (" param |
[0]: %4.2 f\n" , param [0]); |
||||
printf (" h11 : |
%7.5 f\n" , param [1]); |
||||
printf (" h22 : |
%7.5 f\n" , param [4]); |
||||
// check |
if |
the |
second |
entry of |
H *{\ sqrt { d1 )* x1 ,\ sqrt { d2 }* y1 }^ T |
// is zero ; |
the |
values |
of d1 ,d2 , x1 are overwritten so we use |
//their initial values
printf (" %7.5 f\n" ,( -1.0)* sqrt (5.0)*1.0+
|
|
|
|
|
|
param [4]* sqrt (5.0)*2.0); |
|||
|
cublasDestroy ( handle ); |
|
|
// |
destroy |
CUBLAS context |
|||
|
return |
EXIT_SUCCESS ; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// d1:   5.000      [d1]   [5]    [x1]   [1]        [0.5   1 ]
// d2:   5.000      [  ] = [ ],   [  ] = [ ],   H = [        ]
// x1:   1.000      [d2]   [5]    [y1]   [2]        [-1   0.5]
// y1:   2.000
// After srotmg:
// param[0]: 1.00
// h11: 0.50000
// h22: 0.50000
//
//     [sqrt(d1)*x1]   [0.5   1 ]   [sqrt(5)*1]   [5.59]
// H * [           ] = [        ] * [         ] = [    ]
//     [sqrt(d2)*y1]   [-1   0.5]   [sqrt(5)*2]   [ 0  ]
//
// 0.00000   <== the second entry of
//               H*{sqrt(d1)*x1, sqrt(d2)*y1}^T
3.2.11 cublasSscal - scale the vector
This function scales the vector x by the scalar α:
$$x = \alpha x.$$
// nvcc 012sscal.c -lcublas
#include < stdio .h >
#include < stdlib .h >
3.2 CUBLAS Level-1. Scalar and vector based operations |
31 |
#include < cuda_runtime .h >
#include " cublas_v2 .h"
# define |
n 6 |
|
|
// |
length |
of x |
|||
int main ( void ){ |
|
|
|
|
|
|
|
||
cudaError_t cudaStat ; |
|
// cudaMalloc status |
|||||||
cublasStatus_t stat ; |
|
// CUBLAS functions status |
|||||||
cublasHandle_t handle ; |
// |
CUBLAS |
context |
||||||
int |
j; |
|
|
// index |
of |
elements |
|||
float * |
x; |
|
// n - vector |
on |
the |
host |
|||
x =( float *) malloc (n* sizeof (* x )); // host memory |
alloc |
for x |
|||||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
||
x[j ]=( float )j; |
|
// x ={0 ,1 ,2 ,3 ,4 ,5} |
|||||||
printf ("x :\ n" ); |
|
|
|
|
|
|
|
||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
||
printf (" %2.0 f ," ,x[j ]); |
|
|
|
// |
print x |
||||
printf ("\n" ); |
|
|
|
|
|
|
|
||
// on |
the device |
|
|
|
|
|
|
|
|
float * d_x ; |
|
// d_x - x on the device |
|||||||
cudaStat = cudaMalloc (( void **)& d_x ,n* sizeof (* x )); |
|
// device |
|||||||
|
|
|
|
// memory alloc for x |
|||||
stat |
= |
cublasCreate (& handle ); |
// initialize CUBLAS context |
||||||
stat |
= |
cublasSetVector (n , sizeof (* x),x ,1 , d_x ,1); // |
cp |
x -> d_x |
|||||
float |
al =2.0; |
|
|
|
|
|
// |
al =2 |
|
// scale the vector d_x by the scalar al : d_x = al * d_x |
|
|
|||||||
stat=cublasSscal(handle,n,&al,d |
|
x,1); |
|
|
|
|
|
||
stat = cublasGetVector (n , sizeof ( float ),d_x ,1 ,x ,1); // cp |
d_x ->x |
||||||||
printf ("x after Sscal :\ n" ); |
|
x |
after |
Sscal : |
|||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
||
printf (" %2.0 f ," ,x[j ]); |
|
// x ={0 ,2 ,4 ,6 ,8 ,10} |
|||||||
printf ("\n" ); |
|
|
|
|
|
|
|
||
cudaFree ( d_x ); |
|
// free |
device |
memory |
|||||
cublasDestroy ( handle ); |
|
// destroy |
CUBLAS |
context |
|||||
free (x ); |
|
// free |
host |
memory |
|||||
return EXIT_SUCCESS ; |
|
|
|
|
|
|
|
||
} |
|
|
|
|
|
|
|
|
|
// x:
//  0, 1, 2, 3, 4, 5,
// x after Sscal:
//  0, 2, 4, 6, 8,10,      // 2*{0,1,2,3,4,5}
3.2.12 cublasSswap - swap two vectors
This function interchanges the elements of the vectors x and y:
$$x \leftarrow y, \qquad y \leftarrow x.$$
// nvcc 013sswap.c -lcublas
# include < stdio .h >
3.2 CUBLAS Level-1. Scalar and vector based operations |
32 |
#include < stdlib .h >
#include < cuda_runtime .h >
#include " cublas_v2 .h"
# define n |
6 |
|
|
|
|
|
|
|
// |
length |
of x ,y |
|||||
int main ( void ){ |
|
|
|
|
|
|
|
|
|
|
|
|
||||
cudaError_t cudaStat ; |
|
|
|
|
// cudaMalloc status |
|||||||||||
cublasStatus_t stat ; |
// |
CUBLAS functions status |
||||||||||||||
cublasHandle_t handle ; |
|
|
|
|
|
// |
CUBLAS |
context |
||||||||
int |
j; |
|
|
|
|
|
|
|
// index |
of |
elements |
|||||
float * |
x; |
|
|
|
// n - vector on the host |
|||||||||||
float * |
y; |
|
|
|
// |
n - vector |
on |
the |
host |
|||||||
x =( float *) malloc (n* sizeof (* x )); // |
host |
memory |
alloc |
for |
x |
|||||||||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
x[j ]=( float )j; |
|
|
|
|
|
// x ={0 ,1 ,2 ,3 ,4 ,5} |
||||||||||
printf ("x :\ n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
printf (" %2.0 f ," ,x[j ]); |
|
|
|
|
|
|
|
// |
x |
|||||||
printf ("\n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
y =( float *) malloc (n* sizeof (* y )); // |
host |
memory |
alloc |
for |
y |
|||||||||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
y[j ]=( float )2* j; |
|
|
|
|
|
// y ={0 ,2 ,4 ,6 ,8 ,10} |
||||||||||
printf ("y :\ n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
printf (" %2.0 f ," ,y[j ]); |
|
|
|
|
|
|
|
// |
y |
|||||||
printf ("\n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
// on |
the |
device |
|
|
|
|
|
|
|
|
|
|
|
|
||
float * d_x ; |
|
|
// d_x - x on the |
device |
||||||||||||
float * d_y ; |
|
|
// d_y - y on the device |
|||||||||||||
cudaStat = cudaMalloc (( void **)& d_x ,n* sizeof (* x )); |
|
// |
device |
|||||||||||||
|
|
|
|
|
|
|
|
|
// memory alloc for x |
|||||||
cudaStat = cudaMalloc (( void **)& d_y ,n* sizeof (* y )); |
|
// |
device |
|||||||||||||
|
|
|
|
|
|
|
|
|
// memory |
alloc |
for |
y |
||||
stat |
= |
cublasCreate (& handle ); |
// initialize CUBLAS context |
|||||||||||||
stat |
= |
cublasSetVector (n , sizeof (* x),x ,1 , d_x ,1); // |
cp |
x -> d_x |
||||||||||||
stat |
= |
cublasSetVector (n , sizeof (* y),y ,1 , d_y ,1); // |
cp |
y -> d_y |
||||||||||||
// swap the vectors d_x , d_y : |
d_x <--d_y , |
d_y <-- d_x |
|
|
|
|
||||||||||
stat=cublasSswap(handle,n,d |
|
x,1,d |
|
y,1); |
|
|
|
|
|
|
|
|
||||
stat = cublasGetVector (n , sizeof ( float ),d_y ,1 ,y ,1); // cp |
d_y ->y |
|||||||||||||||
stat = cublasGetVector (n , sizeof ( float ),d_x ,1 ,x ,1); // cp |
d_x ->x |
|||||||||||||||
printf ("x after Sswap :\ n" ); |
|
|
|
// |
x |
after |
Sswap : |
|||||||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
printf (" %2.0 f ," ,x[j ]); |
|
|
|
|
|
// x ={0 ,2 ,4 ,6 ,8 ,10} |
||||||||||
printf ("\n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
printf ("y after Sswap :\ n" ); |
|
|
|
// |
y |
after |
Sswap : |
|||||||||
for (j =0;j <n;j ++) |
|
|
|
|
|
|
|
|
|
|
|
|
||||
printf (" %2.0 f ," ,y[j ]); |
|
|
|
|
|
// y ={0 ,1 ,2 ,3 ,4 ,5} |
||||||||||
printf ("\n" ); |
|
|
|
|
|
|
|
|
|
|
|
|
||||
cudaFree ( d_x ); |
|
|
|
|
// |
free |
device |
memory |
||||||||
cudaFree ( d_y ); |
|
|
|
|
// |
free |
device |
memory |
||||||||
cublasDestroy ( handle ); |
// |
destroy |
CUBLAS |
context |