#include "dmi_api.h"

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Parameter block handed from DMI_main to every DMI_scaleunit invocation.
 * DMI_main fills it in and stores it with DMI_write; DMI_scaleunit loads it
 * back with DMI_read.
 */
typedef struct scaleunit_t {
  /* Number of allreduce calls performed in each timed loop. */
  int32_t niter;
  /* Address produced by DMI_mmap for the DMI_barrier_t; passed to
     DMI_barrier_init by DMI_main and to DMI_local_barrier_init by workers. */
  int64_t dmi_barrier_addr;
} scaleunit_t;

/*
 * Parse a base-10 integer command-line argument.
 *
 * text      : NUL-terminated argument string.
 * min_value : smallest value accepted.
 * out       : receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success, -1 when text is not a complete decimal integer,
 * is out of range for long/int32_t, or is smaller than min_value.
 */
static int parse_int32_arg(const char *text, int32_t min_value, int32_t *out)
{
  char *end = NULL;
  long value;

  errno = 0;
  value = strtol(text, &end, 10);
  /* Reject empty input, trailing junk, and values strtol had to clamp. */
  if(end == text || *end != '\0' || errno == ERANGE)
    {
      return -1;
    }
  if(value < min_value || value > INT32_MAX)
    {
      return -1;
    }
  *out = (int32_t)value;
  return 0;
}

/*
 * Program entry point: validates the command line, publishes the benchmark
 * parameters through DMI memory, calls DMI_rescale, and then releases what
 * it set up.
 *
 * argv[1] init_node_num : forwarded to DMI_rescale (must be >= 0).
 * argv[2] thread_num    : forwarded to DMI_rescale (must be >= 0).
 * argv[3] niter         : iterations per timed loop (must be >= 1, because
 *                         DMI_scaleunit divides the accumulated time by it).
 *
 * Any malformed argument takes the same usage/error() path as a wrong
 * argument count.
 */
void DMI_main(int argc, char **argv)
{
  scaleunit_t scaleunit;
  int32_t niter = 0, init_node_num = 0, thread_num = 0;
  int64_t dmi_barrier_addr, scaleunit_addr;
  
  /* argc is tested first so argv[1..3] are only read when they exist. */
  if(argc != 4
     || parse_int32_arg(argv[1], 0, &init_node_num) != 0
     || parse_int32_arg(argv[2], 0, &thread_num) != 0
     || parse_int32_arg(argv[3], 1, &niter) != 0)
    {
      outn("usage : %s init_node_num thread_num niter", argv[0]);
      /* NOTE(review): error() is assumed not to return - confirm in dmi_api.h. */
      error();
    }
  
  /* One region sized for the barrier object, one for the parameter block;
     each address is returned through the first argument. */
  catch(DMI_mmap(&dmi_barrier_addr, sizeof(DMI_barrier_t), 1, NULL));
  catch(DMI_mmap(&scaleunit_addr, sizeof(scaleunit_t), 1, NULL));
  catch(DMI_barrier_init(dmi_barrier_addr));
  
  /* Zero the whole struct first so the padding between the fields is not
     written out as indeterminate stack bytes. */
  memset(&scaleunit, 0, sizeof(scaleunit));
  scaleunit.niter = niter;
  scaleunit.dmi_barrier_addr = dmi_barrier_addr;
  catch(DMI_write(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_EXCLUSIVE, NULL));
  
  /* NOTE(review): presumably this is what runs DMI_scaleunit on the
     participating threads with scaleunit_addr - confirm against DMI docs. */
  catch(DMI_rescale(scaleunit_addr, init_node_num, thread_num));
  
  catch(DMI_barrier_destroy(dmi_barrier_addr));
  catch(DMI_munmap(scaleunit_addr, NULL));
  catch(DMI_munmap(dmi_barrier_addr, NULL));
  return;
}

int32_t DMI_scaleunit(int my_rank, int pnum, int64_t scaleunit_addr)
{
  DMI_local_barrier_t barrier;
  scaleunit_t scaleunit;
  int32_t iter, k;
  double t, t0, t1, d_my_rank, d_sum;
  
  catch(DMI_read(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_GET, NULL));
  bind_to_cpu(my_rank % PROCNUM);
  catch(DMI_local_barrier_init(&barrier, scaleunit.dmi_barrier_addr));
  
  catch(DMI_local_barrier_sync(&barrier, pnum));
  catch(DMI_local_barrier_sync(&barrier, pnum));
  catch(DMI_local_barrier_sync(&barrier, pnum));
  
  d_my_rank = my_rank;
  catch(DMI_local_barrier_allreduce(&barrier, pnum, &d_my_rank, &d_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(d_sum != pnum * (pnum - 1) / 2) error();
  catch(DMI_local_barrier_allreduce(&barrier, pnum, &d_my_rank, &d_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(d_sum != pnum * (pnum - 1) / 2) error();
  
  for(k = 0; k < 3; k++)
    {
      t = 0;
      for(iter = 0; iter < scaleunit.niter; iter++)
        {
          t0 = get_time();
          catch(DMI_local_barrier_allreduce(&barrier, pnum, &d_my_rank, &d_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
          t1 = get_time();
          t += t1 - t0;
          if(d_sum != pnum * (pnum - 1) / 2) error();
          halt((my_rank + 1) * 0.05 / pnum);
        }
      if(my_rank == pnum - 1)
        {
          outn("%d %.15lf", pnum, t / scaleunit.niter);
        }
      t = 0;
      for(iter = 0; iter < scaleunit.niter; iter++)
        {
          t0 = get_time();
          catch(DMI_local_barrier_allreduce(&barrier, pnum, &d_my_rank, &d_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
          t1 = get_time();
          t += t1 - t0;
          if(d_sum != pnum * (pnum - 1) / 2) error();
        }
      if(my_rank == pnum - 1)
        {
          outn("%d %.15lf", pnum, t / scaleunit.niter);
        }
    }
  
  catch(DMI_local_barrier_destroy(&barrier));
  return 0;
}
