Introduction to Parallel and Distributed Computing: Homework 3
This report is written in Markdown; a Markdown reader is recommended, or visit the online copy (并行与分布式计算导论——作业三).
Assignment Requirements
Complete the following MPI programs and submit a report in PDF format, with the source code attached at the end.
"Big" and "Small"
Finding primes (the reference program for this problem is buggy, so no comparison against it is required; a hand-written serial function can be used to verify correctness)
(The problem on computing pi ran into trouble and was replaced by "Big" and "Small"; the concept of a file view is described in the attachment.)
The report may include:
1. the core parallel code or the core idea;
2. a comparison of the output against the reference serial/parallel program, or across different numbers of processes, with a brief analysis;
3. insights from using MPI, or difficulties encountered.
"大"和"小"
Result Comparison on the Linhucaotang Judge
Problem size $N$ (array of $2^N$ elements) | Speedup (1 process) | Speedup (2 processes) | Speedup (4 processes) |
---|---|---|---|
$10$ | 0.569 | 0.209 | 0.044 |
$12$ | 0.691 | 0.937 | 1.269 |
$14$ | 1.035 | 1.406 | 3.180 |
As the table shows, parallelism only pays off once the problem is large enough: at $N = 10$ every configuration is slower than the serial reference, while at $N = 14$ the speedup grows with the number of processes and reaches 3.180 with 4 processes.
Sample Output from the Linhucaotang Judge
http://linhucaotang.com/submission/33993
array-size = 2^14
**************stdout from your impl.****************
k2
bufLen:32768, 32768 0
k2
bufLen:32768, 32768 1
k2
bufLen:32768, 32768 2
k5
bufLen:81920, 81920 3
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 121570 RUNNING AT ailab-fx2-server-02
= EXIT CODE: 134
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Aborted (signal 6)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions
******************result evaluation*****************
congratulations! the anwser is correct.
***************performance evaluation***************
reference serial impl. : time_cost=8.2295e-05
reference parallel impl.: time_cost=2.3413e-04 speedup=0.351
your impl. : time_cost=1.9550e-05 speedup=4.209
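For reference, the speedup reported by the judge is the reference serial time divided by the measured time of the implementation, $\text{speedup} = T_{\text{ref serial}} / T_{\text{impl}}$, which matches the numbers above: $8.2295 \times 10^{-5} / 1.9550 \times 10^{-5} \approx 4.209$.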
Difficulties in This Problem
- The problem statement prescribes the input and output format precisely, so understanding it correctly takes some effort.
- Almost everything in the program is counted in bytes, which requires care.
- When the number of processes is not a power of two, the data cannot be split evenly and needs special handling (see the partition sketch after this list).
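One way to handle a process count that does not divide the number of pairs evenly is to give the first few ranks one extra pair each, instead of pushing the whole remainder onto the last rank as my submission below does. The following is only a sketch of that arithmetic, assuming the course-provided fio.h and its FILE_SEG view (offset/width/stride, all in bytes) as used in the source code; the helper name `make_view` is my own.

```c
#include <stdint.h>
#include "fio.h"   /* course-provided header defining FILE_SEG (offset/width/stride, in bytes) */

/* Hypothetical helper (not part of the submission): build this rank's file view
 * over an array of num int64_t values. The num/2 mirrored pairs are balanced
 * across `world` ranks by giving the first `rem` ranks one extra pair. */
static FILE_SEG make_view(int64_t num, int rank, int world) {
    int64_t pairs = num / 2;
    int64_t base  = pairs / world;
    int64_t rem   = pairs % world;
    int64_t mine  = base + (rank < rem ? 1 : 0);               /* pairs owned by this rank */
    int64_t start = rank * base + (rank < rem ? rank : rem);   /* index of the first owned pair */

    FILE_SEG v;
    v.offset = start * sizeof(int64_t);                        /* front segment start, in bytes */
    v.width  = mine  * sizeof(int64_t);                        /* one contiguous segment, in bytes */
    v.stride = (num - 2 * start - mine) * sizeof(int64_t);     /* distance from the front segment to
                                                                  its mirrored back segment, in bytes */
    return v;
}
```

With this split every rank owns at most one extra pair, so the arithmetic needs no separate branch for the last rank.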
Source Code
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <mpi.h>
#include "fio.h"

// FILE_SEG is defined in fio.h and describes a strided file view, all fields in bytes:
//   int64_t offset;   // where the view starts in the file
//   int64_t width;    // length of each contiguous segment
//   int64_t stride;   // distance from the start of one segment to the start of the next

typedef unsigned char byte;

extern int64_t input_data(void *buf, int64_t count, FILE_SEG fseg);
extern int64_t output_data(void *buf, int64_t count, FILE_SEG fseg);

int main(int argc, char **argv) {
    int N = atoi(argv[1]);
    int num = 1 << N;                     // total number of int64_t elements in the file
    MPI_Init(&argc, &argv);
    int rank;
    int world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    // Each rank owns `size` mirrored pairs: elements [size*rank, size*rank + size)
    // from the front half plus their partners from the back half.
    int size = num / 2 / world;
    FILE_SEG fseg;
    fseg.offset = size * rank * sizeof(int64_t);                     // start of the front segment
    fseg.width  = size * sizeof(int64_t);                            // one segment, in bytes
    fseg.stride = sizeof(int64_t) * (num - size * (2 * rank + 1));   // jump from the front segment to the mirrored back segment
    if (rank == world - 1) {
        // The last rank also takes the leftover pairs when num/2 is not divisible by
        // world; its two segments then sit back to back in the middle of the array.
        fseg.width  = (num / 2 - size * rank) * sizeof(int64_t);
        fseg.stride = fseg.width;
    }

    // Count how many segments of this view start inside the file.
    int64_t length = sizeof(int64_t) * num;
    int64_t k = 0;
    while (k * fseg.stride + fseg.offset < length)
        k++;
    printf("k%lld\n", (long long) k);

    int64_t bufLen = k * fseg.width;
    byte *data = new byte[bufLen];
    int64_t x = input_data(data, bufLen, fseg);      // read the owned segments into one buffer
    printf("bufLen:%lld, %lld %d \n", (long long) bufLen, (long long) x, rank);

    // For each pair, keep the smaller value in the front half ("small")
    // and the larger value in the back half ("big").
    int wid = fseg.width / sizeof(int64_t);
    int64_t *vals = (int64_t *) data;
    for (int i = 0; i < wid; i++) {
        if (vals[i] > vals[2 * wid - 1 - i]) {
            int64_t tmp = vals[i];
            vals[i] = vals[2 * wid - 1 - i];
            vals[2 * wid - 1 - i] = tmp;
        }
    }

    output_data(data, 2 * fseg.width, fseg);         // write both segments back through the same view
    printf("Hello: rank %d, world: %d\n", rank, world);

    delete[] data;
    MPI_Finalize();
    return 0;
}
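For local sanity checks, a plain serial reference that applies the same pairwise rule as the swap loop above (the smaller element of each mirrored pair goes to the front half, the larger to the back half) is easy to write. A minimal sketch; the function name is my own:

```c
#include <stdint.h>

// Serial reference: for every mirrored pair (a[i], a[n-1-i]) put the smaller
// value in the front half and the larger value in the back half.
static void big_small_serial(int64_t *a, int64_t n) {
    for (int64_t i = 0; i < n / 2; i++) {
        if (a[i] > a[n - 1 - i]) {
            int64_t tmp = a[i];
            a[i] = a[n - 1 - i];
            a[n - 1 - i] = tmp;
        }
    }
}
```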
Finding Primes
Result Comparison (local machine: 8th-generation standard-voltage Core i7, 6 cores)
Problem size $k$ (numbers up to $2^k$) | Time, 1 process (ms) | Time, 2 processes (ms) | Time, 4 processes (ms) | Time, 6 processes (ms) |
---|---|---|---|---|
$8$ | 86 | 91 | 105 | 119 |
$12$ | 91 | 96 | 103 | 118 |
$15$ | 91 | 99 | 110 | 127 |
$18$ | 161 | 154 | 166 | 189 |
As the table shows, the parallel version achieves no real speedup: MPI message passing takes up a large share of the total time, so the benefit of parallelism does not show. In addition, the output below shows that running n processes locally does not drive the CPU to n×100% utilization, which is another reason the timings are not very accurate.
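One source of noise is that the timings above wrap the whole `mpirun` launch in the shell's `time`, which also counts process start-up. Timing only the region of interest inside the program with `MPI_Wtime` would exclude that cost; a minimal sketch, with the real work left as a placeholder:

```c
#include <stdio.h>
#include <mpi.h>

// Minimal sketch: bracket the region of interest with MPI_Wtime so that
// process launch and MPI start-up are excluded from the measurement.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Barrier(MPI_COMM_WORLD);      // line up all ranks before starting the clock
    double t0 = MPI_Wtime();

    // ... the per-rank isPrime loop and the merge to rank 0 would go here ...

    MPI_Barrier(MPI_COMM_WORLD);      // wait until every rank has finished
    double t1 = MPI_Wtime();
    if (rank == 0)
        printf("elapsed: %.6f s\n", t1 - t0);

    MPI_Finalize();
    return 0;
}
```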
Sample Local Output
# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:14:49]
$ time mpirun -np 6 findPrime 18
mpirun -np 6 findPrime 18 0.35s user 0.13s system 254% cpu 0.189 total
# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:04]
$ time mpirun -np 4 findPrime 18
mpirun -np 4 findPrime 18 0.25s user 0.08s system 199% cpu 0.166 total
# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:14]
$ time mpirun -np 2 findPrime 18
mpirun -np 2 findPrime 18 0.17s user 0.05s system 141% cpu 0.154 total
# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:31]
$ time mpirun -np 1 findPrime 18
mpirun -np 1 findPrime 18 0.13s user 0.03s system 97% cpu 0.161 total
Source Code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

// Trial-division primality test: returns 1 if n is prime, 0 otherwise.
int isPrime(int n) {
    if (n < 2)
        return 0;
    for (int m = 2; m <= sqrt(n * 1.0); m++) {
        if (n % m == 0)
            return 0;
    }
    return 1;
}

int main(int argc, char **argv) {
    int k = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    int rank;
    int world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    int n = 1 << k;                                  // test the numbers 1..n
    // prime[i] == 1 iff i has been found prime; calloc zero-initializes, so the
    // entries outside this rank's block stay 0 and the OR-merge below is safe.
    int *prime = calloc(n + 1, sizeof(int));

    // Block decomposition: rank r tests the numbers r*size+1 .. r*size+size.
    int size = n / world + 1;
    for (int i = 1; i <= size && rank * size + i <= n; i++) {
        prime[rank * size + i] = isPrime(rank * size + i);
    }
    //printf("Hello: rank %d, world: %d, from %d to %d\n", rank, world, rank*size+1, rank*size+size);

    if (rank != 0) {
        // Workers send their whole flag array to rank 0.
        MPI_Send(prime, n, MPI_INT, 0, 0, MPI_COMM_WORLD);
    } else {
        // Rank 0 ORs every worker's flags into its own array.
        int *buffer = calloc(n + 1, sizeof(int));
        for (int source = 1; source < world; source++) {
            MPI_Recv(buffer, n, MPI_INT, source, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            for (int i = 0; i < n; i++)
                prime[i] = (prime[i] || buffer[i]);
        }
        free(buffer);
    }

    if (rank == 0) {
        // Verify the merged result against the serial test.
        for (int i = 2; i < n; i++)
            if (prime[i] != isPrime(i))
                printf("False check at %d\n", i);
    }

    free(prime);
    MPI_Finalize();
    return 0;
}
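A side note on the communication cost mentioned above: in the manual merge, rank 0 receives the full flag array from every other rank in turn. The same merge can be expressed with a single collective reduction using the logical-OR operator, which MPI implementations typically carry out more efficiently than a loop of point-to-point messages. A minimal self-contained sketch; the toy fill stands in for the real isPrime loop and all names are my own:

```c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

// Sketch: merge per-rank flag arrays with one MPI_Reduce(MPI_LOR) instead of a
// Send/Recv loop. Each rank fills only its own block of a zero-initialized array.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    int n = 1 << 10;                                   // example: 2^10 flags
    int *local  = calloc(n, sizeof(int));
    int *merged = (rank == 0) ? calloc(n, sizeof(int)) : NULL;

    // Toy fill: mark the block owned by this rank (stand-in for the isPrime loop).
    int size = n / world + 1;
    for (int i = rank * size; i < n && i < (rank + 1) * size; i++)
        local[i] = 1;

    // One collective replaces the (world - 1) point-to-point messages.
    MPI_Reduce(local, merged, n, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        long set = 0;
        for (int i = 0; i < n; i++)
            set += merged[i];
        printf("flags set after merge: %ld of %d\n", set, n);   // the blocks together cover 0..n-1
        free(merged);
    }
    free(local);
    MPI_Finalize();
    return 0;
}
```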