Добавил:
Кафедра ВТ Опубликованный материал нарушает ваши авторские права? Сообщите нам.
Вуз: Предмет: Файл:
2 лаба / lab2.docx
Скачиваний:
5
Добавлен:
07.04.2023
Размер:
749.33 Кб
Скачать

Приложение

Листинг lab1_1.c:

/*No optimization*/
/*
 * lab1_1.c - scalar baseline.
 *
 * Times repeated evaluations of calc_pi(), which approximates pi with the
 * midpoint rule applied to the integral of 4 / (1 + x^2) over [0, 1], and
 * prints the CPU time and the wall-clock time spent in the measured loop.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define N_do 1000        /* number of timed calc_pi() calls */
#define N 10000000       /* iterations per timed call; a multiple of 8 */
#define N_do_hotter 100  /* number of warm-up calc_pi() calls */
#define N_hotter 10000   /* iterations per warm-up call; a multiple of 8 */

float calc_pi(unsigned N_iters);

/*
 * Every result is stored here. Without an observable use of the return
 * value an optimizing compiler is free to drop the calls being timed.
 */
static volatile float pi_sink;

int main(void)
{
    clock_t begin_cpu, end_cpu;
    struct timeval begin_time, end_time;

    /* Warm-up */
    for (unsigned i = 0; i < N_do_hotter; ++i) {
        pi_sink = calc_pi(N_hotter);
    }

    gettimeofday(&begin_time, NULL);
    begin_cpu = clock();
    for (unsigned i = 0; i < N_do; ++i) {
        pi_sink = calc_pi(N);
    }
    end_cpu = clock();
    gettimeofday(&end_time, NULL);

    /* The tv_usec difference may be negative; the sum below is still right. */
    long time_seconds = end_time.tv_sec - begin_time.tv_sec;
    long time_microseconds = end_time.tv_usec - begin_time.tv_usec;
    double time_elapsed = time_seconds + time_microseconds * 1e-6;
    long time_elapsed_mcs = (long)(time_elapsed * 1000000 + 0.5);

    double cputime_spent = (double)(end_cpu - begin_cpu) / CLOCKS_PER_SEC;
    long cputime_spent_mcs = (long)(cputime_spent * 1000000 + 0.5);

    printf("CPU time spent: %f sec (%ld us)\n", cputime_spent, cputime_spent_mcs);
    printf("Real time spent: %f sec (%ld us)\n", time_elapsed, time_elapsed_mcs);
    return 0;
}

/*
 * Approximate pi as the mean of 4 / (1 + x_i^2) taken at the midpoints
 * x_i = (i + 0.5) / N_iters, for i = 0 .. N_iters - 1.
 *
 * N_iters: number of midpoints. For N_iters == 0 the result is NaN (0 / 0).
 * Returns the approximation, rounded to float.
 *
 * The running sum is kept in double: with N_iters = 10^7 the sum reaches
 * about 3.1e7, which is above 2^24, so a float accumulator can no longer
 * represent the individual terms being added and the result drifts.
 */
float calc_pi(unsigned N_iters)
{
    double sum = 0.0;

    for (unsigned i = 0; i < N_iters; ++i) {
        float x_i = (i + 0.5) / N_iters;

        x_i = 4.0 / (1 + x_i * x_i);
        sum += x_i;
    }
    sum /= N_iters;

    return (float)sum;
}

Листинг lab1_2.c:

/*No optimization*/
/*Data parallelism*/
/*
 * lab1_2.c - AVX variant.
 *
 * Times repeated evaluations of calc_pi(), which approximates pi with the
 * midpoint rule applied to the integral of 4 / (1 + x^2) over [0, 1],
 * evaluating eight midpoints per step with 256-bit AVX intrinsics.
 * Prints the CPU time and the wall-clock time spent in the measured loop.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <immintrin.h> // AVX

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif

#define N_do 1000        /* number of timed calc_pi() calls */
#define N 10000000       /* iterations per timed call; a multiple of 8 */
#define N_do_hotter 100  /* number of warm-up calc_pi() calls */
#define N_hotter 10000   /* iterations per warm-up call; a multiple of 8 */

float calc_pi(unsigned N_iters);

/*
 * Every result is stored here. Without an observable use of the return
 * value an optimizing compiler is free to drop the calls being timed.
 */
static volatile float pi_sink;

int main(void)
{
    clock_t begin_cpu, end_cpu;
    struct timeval begin_time, end_time;

    /* Warm-up */
    for (unsigned i = 0; i < N_do_hotter; ++i) {
        pi_sink = calc_pi(N_hotter);
    }

    gettimeofday(&begin_time, NULL);
    begin_cpu = clock();
    for (unsigned i = 0; i < N_do; ++i) {
        pi_sink = calc_pi(N);
    }
    end_cpu = clock();
    gettimeofday(&end_time, NULL);

    /* The tv_usec difference may be negative; the sum below is still right. */
    long time_seconds = end_time.tv_sec - begin_time.tv_sec;
    long time_microseconds = end_time.tv_usec - begin_time.tv_usec;
    double time_elapsed = time_seconds + time_microseconds * 1e-6;
    long time_elapsed_mcs = (long)(time_elapsed * 1000000 + 0.5);

    double cputime_spent = (double)(end_cpu - begin_cpu) / CLOCKS_PER_SEC;
    long cputime_spent_mcs = (long)(cputime_spent * 1000000 + 0.5);

    printf("CPU time spent: %f sec (%ld us)\n", cputime_spent, cputime_spent_mcs);
    printf("Real time spent: %f sec (%ld us)\n", time_elapsed, time_elapsed_mcs);
    return 0;
}

// https://doc.rust-lang.org/beta/core/arch/x86_64/index.html
/*
 * Approximate pi as 4 times the mean of 1 / (1 + x_i^2) taken at the
 * midpoints x_i = (i + 0.5) / N_iters, for i = 0 .. N_iters - 1.
 *
 * N_iters: number of midpoints; any value is accepted. Full groups of
 *          eight are evaluated with AVX, the remaining N_iters % 8
 *          midpoints with scalar code. For N_iters == 0 the result is
 *          NaN (0 / 0).
 * Returns the approximation as a float.
 */
float calc_pi(unsigned N_iters)
{
    const float N_f = (float)N_iters;
    /* Largest multiple of 8 not above N_iters: the vector loop stops here
     * instead of running past the end when N_iters % 8 != 0. */
    const unsigned n_vec = N_iters - N_iters % 8;
    float pi = 0.0;
    ALIGNED_(32) float vres[8];
    __m256 onem = _mm256_set1_ps(1.0);
    __m256 Nm = _mm256_set1_ps(N_f);
    __m256 buffm;

    for (unsigned i = 0; i < n_vec; i += 8) {
        float j = (float)i + 0.5;

        /* Lane order is irrelevant: all eight lanes are summed below. */
        buffm = _mm256_set_ps(j, j+1.0, j+2.0, j+3.0, j+4.0, j+5.0, j+6.0, j+7.0);
        buffm = _mm256_div_ps(buffm, Nm);      /* x       */
        buffm = _mm256_mul_ps(buffm, buffm);   /* x^2     */
        buffm = _mm256_add_ps(buffm, onem);    /* 1 + x^2 */
        buffm = _mm256_div_ps(onem, buffm);    /* 1 / (1 + x^2) */
        /* hadd works inside each 128-bit half; after two passes every
         * element of a half holds the sum of that half's four values. */
        buffm = _mm256_hadd_ps(buffm, buffm);
        buffm = _mm256_hadd_ps(buffm, buffm);
        _mm256_store_ps(vres, buffm);
        pi += vres[0] + vres[7];               /* low half + high half */
    }

    /* Scalar tail for the last N_iters % 8 midpoints. */
    for (unsigned i = n_vec; i < N_iters; ++i) {
        float x = ((float)i + 0.5) / N_f;

        pi += 1.0f / (1.0f + x * x);
    }

    pi *= 4.0;
    pi /= N_iters;

    return pi;
}

Листинг lab1_3.c:

/*No optimization*/
/*Data parallelism*/
/*Thread parallelism*/
/*
 * lab1_3.c - OpenMP + AVX variant.
 *
 * Times repeated evaluations of calc_pi(), which approximates pi with the
 * midpoint rule applied to the integral of 4 / (1 + x^2) over [0, 1].
 * The midpoints are split between OpenMP threads and each thread evaluates
 * eight of them per step with 256-bit AVX intrinsics.
 * Prints the CPU time and the wall-clock time spent in the measured loop.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <malloc.h>
#include <immintrin.h> // AVX
#include <omp.h>

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif

#define N_do 1000        /* number of timed calc_pi() calls */
#define N 10000000       /* iterations per timed call; a multiple of 8 */
#define N_do_hotter 100  /* number of warm-up calc_pi() calls */
#define N_hotter 10000   /* iterations per warm-up call; a multiple of 8 */

float calc_pi(unsigned N_iters);

/* Every result is stored here so the timed calls have an observable use. */
static volatile float pi_sink;

int main(void)
{
    clock_t begin_cpu, end_cpu;
    struct timeval begin_time, end_time;

    /* Warm-up */
    for (unsigned i = 0; i < N_do_hotter; ++i) {
        pi_sink = calc_pi(N_hotter);
    }

    gettimeofday(&begin_time, NULL);
    begin_cpu = clock();
    for (unsigned i = 0; i < N_do; ++i) {
        pi_sink = calc_pi(N);
    }
    end_cpu = clock();
    gettimeofday(&end_time, NULL);

    /* The tv_usec difference may be negative; the sum below is still right. */
    long time_seconds = end_time.tv_sec - begin_time.tv_sec;
    long time_microseconds = end_time.tv_usec - begin_time.tv_usec;
    double time_elapsed = time_seconds + time_microseconds * 1e-6;
    long time_elapsed_mcs = (long)(time_elapsed * 1000000 + 0.5);

    double cputime_spent = (double)(end_cpu - begin_cpu) / CLOCKS_PER_SEC;
    long cputime_spent_mcs = (long)(cputime_spent * 1000000 + 0.5);

    printf("CPU time spent: %f sec (%ld us)\n", cputime_spent, cputime_spent_mcs);
    printf("Real time spent: %f sec (%ld us)\n", time_elapsed, time_elapsed_mcs);
    return 0;
}

/*
 * Approximate pi as 4 times the mean of 1 / (1 + x_i^2) taken at the
 * midpoints x_i = (i + 0.5) / N_iters, for i = 0 .. N_iters - 1.
 *
 * N_iters: number of midpoints; any value is accepted. For N_iters == 0
 *          the result is NaN (0 / 0).
 * Returns the approximation as a float.
 *
 * Work is handed out in whole 8-midpoint chunks, not in single midpoints.
 * Splitting by midpoint count while stepping by 8 lets a thread run past
 * its upper bound into the next thread's range whenever the per-thread
 * count is not a multiple of 8, which counts those midpoints twice; it
 * also wraps the upper bound to UINT_MAX when a thread gets no midpoints.
 */
float calc_pi(unsigned N_iters)
{
    const float N_f = (float)N_iters;
    const unsigned n_chunks = N_iters / 8;  /* full groups of 8 midpoints */
    float pi = 0.0;

    #pragma omp parallel
    {
        unsigned th_n = omp_get_num_threads();
        unsigned th_i = omp_get_thread_num();
        unsigned chunks_per_th = n_chunks / th_n;
        /* Half-open chunk range [first, last); the last thread also takes
         * the n_chunks % th_n chunks left over by the integer division. */
        unsigned first = chunks_per_th * th_i;
        unsigned last = (th_i == th_n - 1) ? n_chunks : first + chunks_per_th;

        float pi_local = 0.0;
        ALIGNED_(32) float vres[8];
        __m256 onem = _mm256_set1_ps(1.0);
        __m256 Nm = _mm256_set1_ps(N_f);
        __m256 buffm;

        for (unsigned c = first; c < last; ++c) {
            float j = (float)(c * 8) + 0.5;

            /* Lane order is irrelevant: all eight lanes are summed below. */
            buffm = _mm256_set_ps(j, j+1.0, j+2.0, j+3.0, j+4.0, j+5.0, j+6.0, j+7.0);
            buffm = _mm256_div_ps(buffm, Nm);      /* x       */
            buffm = _mm256_mul_ps(buffm, buffm);   /* x^2     */
            buffm = _mm256_add_ps(buffm, onem);    /* 1 + x^2 */
            buffm = _mm256_div_ps(onem, buffm);    /* 1 / (1 + x^2) */
            /* hadd works inside each 128-bit half; after two passes every
             * element of a half holds the sum of that half's four values. */
            buffm = _mm256_hadd_ps(buffm, buffm);
            buffm = _mm256_hadd_ps(buffm, buffm);
            _mm256_store_ps(vres, buffm);
            pi_local += vres[0] + vres[7];         /* low half + high half */
        }

        /* One synchronized update per thread. */
        #pragma omp atomic
        pi += pi_local;
    }

    /* Scalar tail for the last N_iters % 8 midpoints. */
    for (unsigned i = n_chunks * 8; i < N_iters; ++i) {
        float x = ((float)i + 0.5) / N_f;

        pi += 1.0f / (1.0f + x * x);
    }

    pi *= 4.0;
    pi /= N_iters;

    return pi;
}

Листинг lab1_4.c:

/*No optimization*/
/*Data parallelism*/
/*Thread parallelism*/
/*inline assembly*/
/*
 * lab1_4.c - OpenMP + hand-written AVX (GCC-style inline assembly) variant.
 *
 * Times repeated evaluations of calc_pi(), which approximates pi with the
 * midpoint rule applied to the integral of 4 / (1 + x^2) over [0, 1].
 * The midpoints are split between OpenMP threads and each thread evaluates
 * eight of them per step with AVX instructions written as inline assembly.
 * Prints the CPU time and the wall-clock time spent in the measured loop.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <malloc.h>
#include <omp.h>

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif

#define N_do 1000        /* number of timed calc_pi() calls */
#define N 10000000       /* iterations per timed call; a multiple of 8 */
#define N_do_hotter 100  /* number of warm-up calc_pi() calls */
#define N_hotter 10000   /* iterations per warm-up call; a multiple of 8 */

float calc_pi(unsigned N_iters);

/* Every result is stored here so the timed calls have an observable use. */
static volatile float pi_sink;

int main(void)
{
    clock_t begin_cpu, end_cpu;
    struct timeval begin_time, end_time;

    /* Warm-up */
    for (unsigned i = 0; i < N_do_hotter; ++i) {
        pi_sink = calc_pi(N_hotter);
    }

    gettimeofday(&begin_time, NULL);
    begin_cpu = clock();
    for (unsigned i = 0; i < N_do; ++i) {
        pi_sink = calc_pi(N);
    }
    end_cpu = clock();
    gettimeofday(&end_time, NULL);

    /* The tv_usec difference may be negative; the sum below is still right. */
    long time_seconds = end_time.tv_sec - begin_time.tv_sec;
    long time_microseconds = end_time.tv_usec - begin_time.tv_usec;
    double time_elapsed = time_seconds + time_microseconds * 1e-6;
    long time_elapsed_mcs = (long)(time_elapsed * 1000000 + 0.5);

    double cputime_spent = (double)(end_cpu - begin_cpu) / CLOCKS_PER_SEC;
    long cputime_spent_mcs = (long)(cputime_spent * 1000000 + 0.5);

    printf("CPU time spent: %f sec (%ld us)\n", cputime_spent, cputime_spent_mcs);
    printf("Real time spent: %f sec (%ld us)\n", time_elapsed, time_elapsed_mcs);
    return 0;
}

/*
 * Approximate pi as 4 times the mean of 1 / (1 + x_i^2) taken at the
 * midpoints x_i = (i + 0.5) / N_iters, for i = 0 .. N_iters - 1.
 *
 * N_iters: number of midpoints; any value is accepted. For N_iters == 0
 *          the result is NaN (0 / 0).
 * Returns the approximation as a float.
 *
 * Work is handed out in whole 8-midpoint chunks, not in single midpoints.
 * Splitting by midpoint count while stepping by 8 lets a thread run past
 * its upper bound into the next thread's range whenever the per-thread
 * count is not a multiple of 8, which counts those midpoints twice; it
 * also wraps the upper bound to UINT_MAX when a thread gets no midpoints.
 */
float calc_pi(unsigned N_iters)
{
    const float N_f = (float)N_iters;
    const unsigned n_chunks = N_iters / 8;  /* full groups of 8 midpoints */
    float pi = 0.0;

    #pragma omp parallel
    {
        unsigned th_n = omp_get_num_threads();
        unsigned th_i = omp_get_thread_num();
        unsigned chunks_per_th = n_chunks / th_n;
        /* Half-open chunk range [first, last); the last thread also takes
         * the n_chunks % th_n chunks left over by the integer division. */
        unsigned first = chunks_per_th * th_i;
        unsigned last = (th_i == th_n - 1) ? n_chunks : first + chunks_per_th;

        float pi_local = 0.0;
        /* vmovaps requires 32-byte aligned memory operands. */
        ALIGNED_(32) float vres[8];
        ALIGNED_(32) float v8[8];
        ALIGNED_(32) float onev[8];
        ALIGNED_(32) float Nm[8];

        for (unsigned k = 0; k < 8; ++k) {
            onev[k] = 1.0;
            Nm[k] = N_f;
        }

        for (unsigned c = first; c < last; ++c) {
            float j = (float)(c * 8) + 0.5;

            v8[0] = j;
            v8[1] = j + 1.0;
            v8[2] = j + 2.0;
            v8[3] = j + 3.0;
            v8[4] = j + 4.0;
            v8[5] = j + 5.0;
            v8[6] = j + 6.0;
            v8[7] = j + 7.0;

            /*
             * AT&T operand order: "op src2, src1, dst" computes
             * dst = src1 op src2.
             *
             * The arrays are passed as whole-array memory operands so the
             * compiler knows the asm reads all 32 bytes of each input and
             * writes all 32 bytes of vres. Passing only pointers in
             * registers, or naming a single float as the output, does not
             * tell it that.
             */
            __asm__ __volatile__(
                "vmovaps %[v8], %%ymm0 \n\t"            /* ymm0 = i + 0.5      */
                "vmovaps %[Nm], %%ymm1 \n\t"            /* ymm1 = N            */
                "vmovaps %[onev], %%ymm2 \n\t"          /* ymm2 = 1            */
                "vdivps %%ymm1, %%ymm0, %%ymm0 \n\t"    /* ymm0 = x            */
                "vmulps %%ymm0, %%ymm0, %%ymm0 \n\t"    /* ymm0 = x^2          */
                "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t"    /* ymm0 = 1 + x^2      */
                "vdivps %%ymm0, %%ymm2, %%ymm0 \n\t"    /* ymm0 = 1/(1 + x^2)  */
                "vhaddps %%ymm0, %%ymm0, %%ymm0 \n\t"   /* pairwise sums ...   */
                "vhaddps %%ymm0, %%ymm0, %%ymm0 \n\t"   /* ... per 128-bit half */
                "vmovaps %%ymm0, %[res] \n\t"
                : [res] "=m"(*(float (*)[8])vres)
                : [v8] "m"(*(const float (*)[8])v8),
                  [Nm] "m"(*(const float (*)[8])Nm),
                  [onev] "m"(*(const float (*)[8])onev)
                : "ymm0", "ymm1", "ymm2"
            );

            /* Element 0 holds the low half's sum, element 7 the high half's. */
            pi_local += vres[0] + vres[7];
        }

        /* One synchronized update per thread. */
        #pragma omp atomic
        pi += pi_local;
    }

    /* Scalar tail for the last N_iters % 8 midpoints. */
    for (unsigned i = n_chunks * 8; i < N_iters; ++i) {
        float x = ((float)i + 0.5) / N_f;

        pi += 1.0f / (1.0f + x * x);
    }

    pi *= 4.0;
    pi /= N_iters;

    return pi;
}

Соседние файлы в папке 2 лаба