其他分享
首页 > 其他分享> > c – 多线程基准测试问题

c – 多线程基准测试问题

作者:互联网

我编写了一个代码,可以从2×2到50×50的尺寸随机生成两个矩阵.然后我记录从尺寸2到50的每个矩阵乘法所花费的时间.我记录这个时间100次以获得每个情况2 -50的良好平均值.程序首先通过顺序乘以矩阵开始,并在csv文件中记录平均执行时间.然后使用pthreads继续并行矩阵乘法,并将平均执行时间记录在单独的csv文件中.我的问题是顺序乘法的平均执行时间比并行执行要短很多.对于大小为50的矩阵,顺序乘法需要500微秒,并行乘法需要2500微秒.这是一个问题,因为我如何计时代码?或者我的线程实现不能很好地工作,实际上导致代码执行时间更长?我在生成矩阵后启动计时器,并在所有线程连接在一起后停止它.线程代码最初是针对两个不均匀大小的矩阵编写的,因此它实现了负载平衡算法.

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <algorithm>
#include  <vector>
#include <stdlib.h>
#include <pthread.h>
#include <cstdlib>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <unistd.h>


using namespace std;
int n,i,j,t,k,l,MAX;
float randomnum,sum1, avg;
float matA[100][100];
float matB[100][100];
float matC[100][100];
struct Loading
{
    int r;
    int c;
    int n;
    int m;
};

// threads
pthread_t threads[100] = { 0 };

// indexes
int indexes[100] = {0};

// load balancing
Loading loads[100] = { 0 };

// for printing in thread
pthread_mutex_t M;

// run thread
void* multi(void* arg)
{
    int index = *((int*)(arg));
    Loading load = loads[index];
    int i = 0;
    int j = 0;
    int k = 0;
    int istart = load.r;
    int jstart = load.c;

    pthread_mutex_lock(&M);
     // cout << "thread #" << index << " pid: " << getpid() << " starting " << " row " << istart << " col "  << jstart << endl;
    pthread_mutex_unlock(&M);

    // logic to balance loads amongst threads using for loop
    int n = load.n;
    for (i = istart; i < MAX; i++)
    {

        for (j =jstart;n > 0 && j < MAX; j++,n--)
        {
            for (k = 0; k < MAX; k++)
            {
                matC[i][j] += matA[i][k] * matB[k][j];
            }

            pthread_mutex_lock(&M);
            //cout << "row " << i << " col "<< j << " value " << matC[i][j] << endl;
            pthread_mutex_unlock(&M);
        }

        jstart = 0;

        if (n == 0)
        {
           pthread_mutex_lock(&M);
          // cout << "thread #" << index << " pid: " << getpid() << " has completed " << endl;
           pthread_mutex_unlock(&M);
           return 0;

        }
    }


    return 0;
}

int num_threads = 0;
int MAX_THREADS= 0;




int main()
{

pthread_mutex_init(&M, NULL);


srand ( time(NULL) );

//for (n=2; n<4; n++) {
ofstream myfile;
    //  myfile.open ("/home/gage/Desktop/timing/seqrecord.csv");
myfile.open ("seqrecord.csv");
      myfile << "testtowork\n";

for (n=2; n<50; n++){
    MAX =n;
    myfile << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
for(t=0; t<101; t++){
 //clock_t startTime = clock();
auto start = chrono::steady_clock::now();
  for (i = 0; i < MAX; ++i)
for (j = 0; j < MAX; ++j)
for (k = 0; k < MAX; ++k)
{
matC[i][j] += matA[i][k] * matB[k][j];
}


//int stop_s=clock();
auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();

}
avg = sum1 / 100;
myfile << "Average execution" << "," << avg << "\n";
sum1 =0;
avg = 0;


// }
}

myfile.close();
ofstream myfile1;

myfile1.open ("parallel.csv");
      myfile1 << "testtowork\n";





for (n=2; n<51; n++)
{
MAX = n;
MAX_THREADS = n*n;
num_threads =n;
 myfile1 << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
           for(t=0; t<101; t++){
         //clock_t startTime = clock();
            auto start = chrono::steady_clock::now();
         // calculade load balancing

        // cout << "calculation load balancing" << endl;

            double nwhole = (double)MAX_THREADS / num_threads;
            double last = 0;
            double sum = 0;
            int k = 0;

            loads[k].r = 0;
            loads[k].c = 0;
            loads[k].n = 0;


            while (k < num_threads)
            {

                sum = sum + nwhole;

                loads[k].n = (int)sum - (int)last;

                // check last length
                if(k == num_threads-1 && sum != MAX_THREADS)
                {
                    sum=MAX_THREADS;
                    loads[k].n=(int)sum - (int)last;
                }

                // display result
              //  cout << (int)last << " to " << (int)sum << " length: " << (int)sum - int(last) << endl;


                k++;

                if(k < num_threads)
                {
                loads[k].r = ((int)sum) / MAX;
                loads[k].c = ((int)sum) % MAX;
                }


                last = sum;


            }




        //cout << "making threads" << endl;

        void* exit_status;

        int rc;

        for( i = 0; i < num_threads ; i++ ) {
         //     cout << "main() : creating thread, " << i << endl;
              indexes[i] = i;
              rc = pthread_create(&threads[i], NULL, multi, (void *)&indexes[i]);



              if (rc) {
           //      cout << "Error:unable to create thread," << rc << endl;
                 exit(-1);
              }
           }

        // wait for threads to end
        for (j = 0; j < num_threads; j++)
        {

            pthread_join(threads[j], &exit_status);
        }

auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile1 << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();
 }
 avg = sum1 / 100;
myfile1 << "Average" << "," << avg << "\n";
sum1 =0;
avg = 0;



       }


return 0;
}

解决方法:

首先,您的矩阵大小太小,无法以多线程方式将它们相乘,因为创建线程,上下文切换和连接线程很可能会带来比花费更多时间的开销.对于更大的矩阵大小,你必须测量(我猜它将在50×50左右),与乘法时间相比,线程的开销足够低,因此性能会提高.
此外,您正在创建太多线程.您正在为矩阵的一行创建一个线程,因此开销将是巨大的.如果CPU上有4个内核,则创建4个以上的线程(包括主线程)将导致上下文切换的开销增加.你可以在这里做的是创建几个线程并在线程之间分配数据,所以例如(注意我为了简单起见我使用std :: thread):

int a[50][50];
int b[50][50];
int c[50][50];

void multiply_part_of_matrix(int start, int end) {
  for(int i=start; i < end; ++i) {
    for (int j = 0; j < 50; ++j) {
      c[i][j] = 0;
      for(int k = 0; k < 50; ++i) {
        c[i][j] = a[i][k] * b[k][j];
      }
    }
  }
}


int main() {
  // initializes matrix
  std::vector<std::thread> threads;
  // start time
  for(int i=0; i < 5; ++i) {
    threads.emplace_back(multiply_part_of_matrix, i*10, i*10+10); 
  }

  for(int i = 0; i < 5; ++i) {
    threads.at(i).join();
  }
  // stop time
  return 0;
}

请注意,如果向主线程提供一些数据,它也会提高性能,以便在等待其他线程时不会阻塞(开销).

如果您想进一步提高性能,可以考虑使用不同的算法(Strassen算法)或缓存优化,例如通过循环展开.

标签:c,multithreading,timing,pthreads,benchmarking
来源: https://codeday.me/bug/20190923/1814117.html