并行与分布式计算导论 第三次作业

Based on markdown 推荐使用markdown阅读器阅读,或访问 并行与分布式计算导论——作业三

作业要求

完成以下MPI程序,提交作业报告,并在报告的最后附源码,格式为PDF。
“大”和“小”
求素数(这题ref程序有问题,不要求对比,可以自己写一个串行的函数验证正确性)
(求pi的那题出了点问题,换成大和小,视图的概念如附件所示)
报告的内容可以包括:
1.并行的核心代码或核心思想;
2.与参考串行/并行程序相比较,或使用不同的线程数量比较输出结果并简单分析;
3.使用MPI的心得或遇到的困难。

"大"和"小"

临湖草堂数据对比

问题规模N线程数加速比线程数加速比线程数加速比
$10$10.56920.20940.044
$12$10.69120.93741.269
$14$11.03521.40643.180

从上表中可以看出,当问题规模上升,并且线程数上升时,并行计算的时间大幅降低。

林湖草堂输出示例

http://linhucaotang.com/submission/33993

array-size = 2^14

**************stdout from your impl.****************
k2
bufLen:32768, 32768 0 
k2
bufLen:32768, 32768 1 
k2
bufLen:32768, 32768 2 
k5
bufLen:81920, 81920 3 

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 121570 RUNNING AT ailab-fx2-server-02
=   EXIT CODE: 134
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Aborted (signal 6)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

******************result evaluation*****************
congratulations! the anwser is correct.

***************performance evaluation***************
reference serial impl.  : time_cost=8.2295e-05
reference parallel impl.: time_cost=2.3413e-04 speedup=0.351   
your impl.              : time_cost=1.9550e-05 speedup=4.209

本题难点

  1. 本题主要是题目的输入输出有明确规定,正确理解题意需要下一番功夫
  2. 程序中基本上都是以字节数计数的,需要注意
  3. 当线程数非2的幂次时,程序无法均分,需要特殊处理

源代码

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include "fio.h"

//extern struct FILE_SEG {
//
//    int64_t offset;
//
//    int64_t width;
//
//    int64_t stride;
//
//};


typedef unsigned char byte;

extern int64_t input_data(void *buf, int64_t count, FILE_SEG fseg);

extern int64_t output_data(void *buf, int64_t count, FILE_SEG fseg);


int main(int argc, char **argv) {

    int N = atoi(argv[1]);
    int num = pow(2, N);

    MPI_Init(NULL, NULL);
    int rank;
    int world;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    int size = num / 2 / world;

    FILE_SEG fseg;
    fseg.offset = size * rank * sizeof(int64_t);
    fseg.width = size * sizeof(int64_t);
    fseg.stride = sizeof(int64_t) * (num - size * (2 * rank + 1));
    if(rank==world-1) {
        fseg.width = (num / 2 - size * rank) * sizeof(int64_t);
        fseg.stride=fseg.width;
    }

    int64_t length = sizeof(int64_t) * num;
    int64_t k = 0;
    while (k * fseg.stride + fseg.offset < length)
        k++;
    printf("k%lld\n",k);
    int64_t bufLen = k * fseg.width;
    byte *data = new byte[bufLen];
    int64_t x = input_data(data, bufLen, fseg);
    printf("bufLen:%lld, %lld %d \n", bufLen, x, rank);

    int wid=fseg.width/ sizeof(int64_t);
    for (int i = 0; i < wid; i++) {
        if(((int64_t *) data)[i]>((int64_t *) data)[2*wid-1-i]){
            //printf("%lld,%lld,",((int64_t *) data)[i],((int64_t *) data)[2*wid-1-i]);
            int64_t tmp =((int64_t *) data)[i];
            ((int64_t *) data)[i]=((int64_t *) data)[2*wid-1-i];
            ((int64_t *) data)[2*wid-i-1]=tmp;
        }
    }
    //MPI_Barrier(MPI_COMM_WORLD);
    //printf("\n");

    output_data(data,2*fseg.width,fseg);

    printf("Hello: rank %d, world: %d\n", rank, world);

    MPI_Finalize();

    return 0;
}

求素数

结果比较(本机8代标压酷睿i7-6核心)

问题规模线程数时间(ms)线程数时间(ms)线程数时间(ms)线程数时间(ms)
$8$18629141056119
$12$19129641036118
$15$19129941106127
$18$1161215441666189

从表中可以看出,并行算法并没有加速,这是由于MPI传递信息的时间占比较大,并行加速不明显,另外,从下面部分也可以看出,在本地并非几线程就将CPU满载至百分之几百,这也是时间不准的原因之一。

本地示例输出

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:14:49] 
$ time mpirun -np 6 findPrime 18
mpirun -np 6 findPrime 18  0.35s user 0.13s system 254% cpu 0.189 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:04] 
$ time mpirun -np 4 findPrime 18
mpirun -np 4 findPrime 18  0.25s user 0.08s system 199% cpu 0.166 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:14] 
$ time mpirun -np 2 findPrime 18
mpirun -np 2 findPrime 18  0.17s user 0.05s system 141% cpu 0.154 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:31] 
$ time mpirun -np 1 findPrime 18
mpirun -np 1 findPrime 18  0.13s user 0.03s system 97% cpu 0.161 total

源码

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>

int isPrime(int n) {
    int flag = 1;
    if (n == 1 || n == 2)
        return 0;
    for (int m = 2; m <= sqrt(n * 1.0); m++) {
        if (n % m == 0) {
            flag = 0;
            break;
        }
    }
    return flag;
}

int main(int argc, char **argv) {

    int k = atoi(argv[1]);
    FILE *fp;


    MPI_Init(NULL, NULL);
    int rank;
    int world;

    int prime[(int) pow(2, 20)];
    prime[0] = 0;
    prime[1] = 0;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    int size = (int) pow(2, k) / world + 1;

    for (int i = 1; i <= size && rank * size + i <= (int) pow(2, k); i++) {
        prime[rank * size + i] = isPrime(rank * size + i);
    }

    //printf("Hello: rank %d, world: %d, from %d to %d\n",rank, world, rank*size+1, rank*size+size);

    if (rank != 0) {
        MPI_Send(prime, (int) pow(2, k), MPI_INT, 0, 0, MPI_COMM_WORLD);
        //printf("%d\n",rank);
    } else {
        for (int source = 1; source < world; source++) {
            int buffer[(int) pow(2, k)];
            MPI_Recv(buffer, (int) pow(2, k), MPI_INT, source, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            //printf("0 %d\n",source);
            for (int i = 0; i < (int) pow(2, k); i++)
                prime[i] = (prime[i] || buffer[i]);
        }

    }

    if (rank == 0) {
        //fp = fopen("ref.out", "w");
//        for(int i=1;i<pow(2,k);i++)
//            if(prime[i]) {
//                printf("%d\n", i);
//                //fwrite(&i, 64, 1, fp);
//            }
        for (int i = 2; i < pow(2, k); i++)
            if (prime[i] != isPrime(i))
                printf("False check at %d\n", i);
    }

    MPI_Finalize();


    return 0;
}
最后修改:2021 年 03 月 13 日
如果觉得我的文章对你有用,请随意赞赏