#include "dmi_api.h"

/* Directory prefix of the per-rank input files; paths are built as
   DIR "/" <dataset>_<rank>_<pnum>.img */
#define DIR "/data/1/haraken/mat/"

/* Non-zero enables the extra diagnostic output guarded by #if DEBUG
   (norm checks in DMI_scaleunit, block sums in make_mm). */
#define DEBUG 0
/* NOTE(review): not referenced in the visible part of this file. */
#define ASSERT 1
/* make_mm buffer strategy: 1 grows the factor arrays with my_realloc row by
   row, 0 preallocates extn * 200 entries up front. */
#define REALLOC 0

/* Sentinel int expected after every section of the .img file. Because it is
   non-zero it also enables the "#if MARKER" checks in make_data. */
#define MARKER 723
/* Sentinel int expected after the last section (the right-hand side). */
#define MARKER_END 443
/* NOTE(review): not referenced in the visible part of this file. */
#define DUMMY 1e-23

/* Iteration stops once the reported relative residual drops below EPS. */
#define EPS 1.0e-12

/* Parameter block that DMI_main writes into DMI memory and that every
   DMI_scaleunit instance reads back when it starts. */
typedef struct scaleunit_t
{
  int32_t fillin;            /* fill-in level limit passed to make_mm() */
  int32_t niter;             /* upper bound on the solver iterations */
  int64_t barrier_addr;      /* DMI address of the shared barrier */
  int64_t am_rwset_addr;     /* DMI address of the rwset used with data->am */
  int64_t em_rwset_addr;     /* DMI address of the rwset used with data->em */
  char dataset[FNAME_SIZE];  /* dataset name used to build the input path */
}scaleunit_t;

/* Block-sparse matrix of one rank as loaded by make_data(). Each stored
   entry is a block of 9 doubles and vectors hold 3 doubles per index. */
typedef struct am_t
{
  int n;       /* number of entries in inner[]; local vectors have n * 3 doubles */
  int nplu;    /* number of stored blocks: cols[] has nplu ints, ds[] nplu * 9 doubles */
  int extn;    /* number of entries in to[]; exchange buffers have extn * 3 doubles */
  int *rows;   /* row pointers: row i owns entries rows[i] .. rows[i + 1] - 1 */
  int *cols;   /* column index of each stored block */
  int *to;     /* element id handed to the DMI rwset for each of the extn indices */
  int *inner;  /* n indices into to[]; to[inner[i]] are the ids this rank writes */
  double *ds;  /* block values, 9 doubles per stored block */
  DMI_local_rwset_t *local_rwset;  /* rank-local rwset handle, set up in decompose() */
}am_t;

/* Preconditioner produced by make_mm(): separate lower (l*) and upper (u*)
   factors in row-pointer form with 9 doubles per block. to, inner and
   local_rwset are borrowed from the am_t passed to make_mm(); free_mm() does
   not release them. */
typedef struct mm_t
{
  int n;        /* copied from the source matrix (em->n) */
  int nplu;     /* block count field mirroring am_t — TODO confirm how it is used */
  int extn;     /* number of factor rows (em->extn) */
  int *lrows;   /* extn + 1 row pointers into lcols / lds */
  int *lcols;   /* column index of each lower-factor block */
  int *urows;   /* extn + 1 row pointers into ucols / uds */
  int *ucols;   /* column index of each upper-factor block */
  int *to;      /* borrowed from the source matrix */
  int *inner;   /* borrowed from the source matrix */
  double *lds;  /* lower-factor block values */
  double *uds;  /* upper-factor block values */
  DMI_local_rwset_t *local_rwset;  /* borrowed from the source matrix */
}mm_t;

/* Everything make_data() loads for one rank; released by free_data(). */
typedef struct data_t
{
  am_t *am;    /* first matrix in the file; used for the matrix-vector products */
  am_t *em;    /* second matrix in the file; input of make_mm() */
  double *bv;  /* right-hand side, am->n * 3 doubles */
}data_t;

/* Forward declarations of the helpers used by DMI_scaleunit(). */

/* Registers the element ids of am and em with their rwsets. */
void decompose(am_t *am, am_t *em, int my_rank, int64_t am_rwset_addr, int64_t em_rwset_addr, DMI_local_barrier_t *local_barrier, int pnum);
/* Load / release the per-rank input file contents. */
data_t* make_data(char *dataset, int my_rank, int pnum);
void free_data(data_t *data);
/* Build / release the level-limited block factorization of em. */
mm_t* make_mm(am_t *em, int fillin);
void free_mm(mm_t *mm);
/* Distributed kernels. NOTE(review): their bodies are not fully visible here;
   from the call sites matvec presumably computes qv = A * pv and fwbk
   presumably applies the preconditioner (zv = M^-1 * rv) — confirm. */
void matvec(am_t *am, double *pv, double *qv, double *av, DMI_local_barrier_t *local_barrier, int32_t pnum);
void fwbk(mm_t *mm, double *rv, double *zv, double *ev, DMI_local_barrier_t *local_barrier, int32_t pnum);
/* Local vector kernels; the exact update formulas live in their definitions. */
void initial(double *bv, double *xv, double *rv, double *r2v, int n);
void daxpy(double *pv, double *qv, double *rv, double scale, int n);
void daxpy2(double *xv, double *p2v, double *s2v, double alpha, double omega, int n);
void daxpy3(double *pv, double *rv, double *qv, double beta, double omega, int n);
void daxpy4(double *pv, double *rv, double *qv, double beta, double omega, int n);
void daxpy5(double *pv, double *rv, double *qv, double beta, double omega, double alpha, int n);
void copy(double *pv, double *qv, int n);
/* Reductions that take the barrier, i.e. they are collective over pnum ranks. */
double dot(double *pv, double *qv, int n, DMI_local_barrier_t *local_barrier, int32_t pnum);
double norm(double *pv, int n, DMI_local_barrier_t *local_barrier, int32_t pnum);
/* Reports per-iteration timing; called by every rank at the end of a run. */
void print_calctime(double total, double t, DMI_local_barrier_t *local_barrier, int my_rank, int pnum);

/* Per-thread timing accumulators. At the end of DMI_scaleunit each of the
   first five is summed over all ranks and printed divided by pnum. */
__thread double t_dot = 0;
__thread double t_fwbk = 0;
__thread double t_matvec = 0;
__thread double t_barr = 0;
__thread double t_comm = 0;

/* Per-thread accumulator that DMI_scaleunit divides by the iteration count
   and hands to print_calctime(). */
__thread double t_calc = 0;

/*
 * Program entry point called by the DMI runtime.
 *
 * Command line: init_node_num thread_num dataset fillin niter
 *
 * Steps:
 *   1. allocate DMI memory for the parameter block, the barrier and the two
 *      rwsets;
 *   2. read the first int (the local size n) of every per-rank input file to
 *      obtain the total element count used to size the rwsets;
 *   3. publish the parameters through a scaleunit_t and start the workers
 *      with DMI_rescale();
 *   4. destroy and unmap everything once DMI_rescale() has returned.
 *
 * Invalid arguments, over-long names and I/O failures end in error().
 */
void DMI_main(int argc, char **argv)
{
  FILE *fp;
  scaleunit_t scaleunit;
  char file[FNAME_SIZE];
  char *dataset;
  int n, init_node_num, thread_num, fillin, rank, pnum, ret, len, element_num, niter;
  int64_t scaleunit_addr, barrier_addr, am_rwset_addr, em_rwset_addr;
  
  if(argc != 6)
    {
      errn("usage : %s init_node_num thread_num dataset fillin niter", argv[0]);
      error();
    }
  
  init_node_num = atoi(argv[1]);
  thread_num = atoi(argv[2]);
  dataset = argv[3];
  fillin = atoi(argv[4]);
  niter = atoi(argv[5]);
  
  /* atoi() yields 0 for non-numeric input; a non-positive process count
     would silently produce empty rwsets and no workers. */
  if(init_node_num <= 0 || thread_num <= 0)
    {
      errn("init_node_num and thread_num must be positive");
      error();
    }
  /* The name is copied into the fixed-size scaleunit.dataset below. */
  if(strlen(dataset) >= sizeof(scaleunit.dataset))
    {
      errn("dataset name is too long : %s", dataset);
      error();
    }
  pnum = init_node_num * thread_num;
  
#if DEBUG_MMAP
#if DEBUG_BASIC
  fprintf(stderr, "correct page size (scaleunit_addr) : %zu\n", sizeof(scaleunit_t));
#endif
  ret = DMI_mmap(&scaleunit_addr, DMI_AUTOMATIC, sizeof(scaleunit_t) * 1, NULL);
#else
  ret = DMI_mmap(&scaleunit_addr, sizeof(scaleunit_t), 1, NULL);
#endif
  catch(ret);
#if DEBUG_MMAP
#if DEBUG_BASIC
  fprintf(stderr, "correct page size (barrier_addr) : %zu\n", sizeof(DMI_barrier_t));
#endif
  ret = DMI_mmap(&barrier_addr, DMI_AUTOMATIC, sizeof(DMI_barrier_t) * 1, NULL);
#else
  ret = DMI_mmap(&barrier_addr, sizeof(DMI_barrier_t), 1, NULL);
#endif
  catch(ret);
#if DEBUG_MMAP
#if DEBUG_BASIC
  fprintf(stderr, "correct page size (am_rwset_addr) : %zu\n", sizeof(DMI_rwset_t));
#endif
  ret = DMI_mmap(&am_rwset_addr, DMI_AUTOMATIC, sizeof(DMI_rwset_t) * 1, NULL);
#else
  ret = DMI_mmap(&am_rwset_addr, sizeof(DMI_rwset_t), 1, NULL);
#endif
  catch(ret);
#if DEBUG_MMAP
#if DEBUG_BASIC
  fprintf(stderr, "correct page size (em_rwset_addr) : %zu\n", sizeof(DMI_rwset_t));
#endif
  ret = DMI_mmap(&em_rwset_addr, DMI_AUTOMATIC, sizeof(DMI_rwset_t) * 1, NULL);
#else
  ret = DMI_mmap(&em_rwset_addr, sizeof(DMI_rwset_t), 1, NULL);
#endif
  catch(ret);
    
  /* Sum the leading int (local size n) of every rank's file. */
  element_num = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      len = snprintf(file, sizeof(file), "%s/%s_%d_%d.img", DIR, dataset, rank, pnum);
      if(len < 0 || (size_t)len >= sizeof(file)) error();  /* path was truncated */
      fp = fopen(file, "rb");
      if(fp == NULL) error();
      
      if(fread(&n, sizeof(int), 1, fp) != 1) error();
      
      element_num += n;
      
      fclose(fp);
    }
  
  catch(DMI_barrier_init(barrier_addr));
  catch(DMI_rwset_init(am_rwset_addr, element_num, sizeof(double) * 3, pnum));
  catch(DMI_rwset_init(em_rwset_addr, element_num, sizeof(double) * 3, pnum));
  
  /* The whole struct is copied to DMI memory, so clear it first: padding and
     the unused tail of dataset[] would otherwise be indeterminate. */
  memset(&scaleunit, 0, sizeof(scaleunit));
  scaleunit.fillin = fillin;
  scaleunit.niter = niter;
  scaleunit.barrier_addr = barrier_addr;
  scaleunit.am_rwset_addr = am_rwset_addr;
  scaleunit.em_rwset_addr = em_rwset_addr;
  strcpy(scaleunit.dataset, dataset);  /* length was validated above */
  catch(DMI_write(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_EXCLUSIVE, NULL));
  
  DMI_rescale(scaleunit_addr, init_node_num, thread_num);
  
  /* Tear down in reverse order of creation. */
  catch(DMI_rwset_destroy(em_rwset_addr));
  catch(DMI_rwset_destroy(am_rwset_addr));
  catch(DMI_barrier_destroy(barrier_addr));
  
  catch(DMI_munmap(em_rwset_addr, NULL));
  catch(DMI_munmap(am_rwset_addr, NULL));
  catch(DMI_munmap(barrier_addr, NULL));
  catch(DMI_munmap(scaleunit_addr, NULL));
  return;
}

/*
 * Worker body executed by each of the pnum ranks.
 *
 * my_rank        : rank of this worker (0 .. pnum - 1); rank 0 prints the report
 * pnum           : total number of workers taking part in the barriers
 * scaleunit_addr : DMI address of the scaleunit_t written by DMI_main
 *
 * Loads this rank's part of the data set, registers it with the rwsets,
 * builds the preconditioner with make_mm() and then runs at most
 * scaleunit.niter solver iterations, stopping early once the reported
 * relative residual falls below EPS. Always returns 0; failures of DMI calls
 * go through catch().
 */
int32_t DMI_scaleunit(int my_rank, int pnum, int64_t scaleunit_addr)
{
  am_t *am, *em;
  mm_t *mm;
  double *bv;
  data_t *data;
  char dataset[FNAME_SIZE];
  int iter, iter_done, fillin, n;
  int64_t barrier_addr, am_rwset_addr, em_rwset_addr;
  double rho, rho_old, alpha, beta, bv_norm, xv_norm, bax_norm, rv_norm, res
    , tmpdot0, tmpdot1, tmpdot2, tmpdot3, tmpdot4, tmp, qsi, eta, t_sum
    , avg_total, avg_calc;
  double *rv, *r2v, *mrv, *amrv, *pv, *apv, *yv, *mtv, *uv, *auv, *zv, *tv, *xv, *av, *ev;
  size_t vec_bytes;
  scaleunit_t scaleunit;
  DMI_local_barrier_t *local_barrier;
  
  local_barrier = (DMI_local_barrier_t*)my_malloc(sizeof(DMI_local_barrier_t));
  
  catch(DMI_read(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_GET, NULL));
  bind_to_cpu(my_rank % PROCNUM);
      
  fillin = scaleunit.fillin;
  barrier_addr = scaleunit.barrier_addr;
  am_rwset_addr = scaleunit.am_rwset_addr;
  em_rwset_addr = scaleunit.em_rwset_addr;
  /* The name arrives through shared memory; make sure it is terminated
     before copying it into the equally sized local buffer. */
  scaleunit.dataset[FNAME_SIZE - 1] = '\0';
  strcpy(dataset, scaleunit.dataset);
  
  catch(DMI_local_barrier_init(local_barrier, barrier_addr));
  
  if(my_rank == 0)
    {
      outn("=== loading data ===");
    }
  
  data = make_data(dataset, my_rank, pnum);
  
  am = data->am;
  em = data->em;
  bv = data->bv;
  
  n = am->n;
  
  /* Work vectors hold 3 doubles per local index; av / ev are the exchange
     buffers sized by the extended index ranges. The size is computed in
     size_t so that n * 3 cannot overflow int. */
  vec_bytes = (size_t)n * 3 * sizeof(double);
  rv = (double*)my_malloc(vec_bytes);
  r2v = (double*)my_malloc(vec_bytes);
  mrv = (double*)my_malloc(vec_bytes);
  amrv = (double*)my_malloc(vec_bytes);
  pv = (double*)my_malloc(vec_bytes);
  apv = (double*)my_malloc(vec_bytes);
  yv = (double*)my_malloc(vec_bytes);
  mtv = (double*)my_malloc(vec_bytes);
  uv = (double*)my_malloc(vec_bytes);
  auv = (double*)my_malloc(vec_bytes);
  tv = (double*)my_malloc(vec_bytes);
  zv = (double*)my_malloc(vec_bytes);
  xv = (double*)my_malloc(vec_bytes);
  av = (double*)my_malloc((size_t)am->extn * 3 * sizeof(double));
  ev = (double*)my_malloc((size_t)em->extn * 3 * sizeof(double));
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  time_lap(10);
  
  decompose(am, em, my_rank, am_rwset_addr, em_rwset_addr, local_barrier, pnum);
  
  time_lap(11);
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  time_lap(12);
  
  if(my_rank == 0)
    {
      outn("=== ILU(k) decomposition ===");
    }
  
  mm = make_mm(em, fillin);
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  /*
   Iteration scheme (M is the preconditioner, A the matrix):

   x(0) = 0
   r(0) = b
   r2 = r(0) or random
   mr(0) = M^-1 * r(0)
   amr(0) = A * mr(0)
   p(0) = mr(0)
   ap(0) = amr(0)
   rho(0) = <r2,r(0)>
   for k=1,2,...
      tmpdot0 = <ap(k-1),r2>
      alpha   = rho(k-1) / tmpdot0
      tmpdot0 = <y(k-1),y(k-1)>
      tmpdot1 = <amr(k-1),r(k-1)>
      tmpdot2 = <y(k-1),r(k-1)>
      tmpdot3 = <amr(k-1),y(k-1)>
      tmpdot4 = <amr(k-1),amr(k-1)>
      tmp     = tmpdot4*tmpdot0-tmpdot3*tmpdot3
      qsi     = (tmpdot0*tmpdot1-tmpdot2*tmpdot3) / tmp
      eta     = (tmpdot4*tmpdot2-tmpdot3*tmpdot1) / tmp
      t(k)    = qsi*ap(k-1) + eta*y(k-1)
      mt(k)   = M^-1 * t(k)
      u(k)    = mt(k) + eta*beta*u(k-1)
      au(k)   = A * u(k)
      z(k)    = qsi*mr(k-1) + eta*z(k-1) - alpha*u(k)
      y(k)    = qsi*amr(k-1) + eta*y(k-1) - alpha*au(k)
      x(k)    = x(k-1) + alpha*p(k-1) + z(k)
      r(k)    = r(k-1) - alpha*ap(k-1) - y(k)
      mr(k)   = M^-1 * r(k)
      amr(k)  = A * mr(k)
      rho(k)  = <r2,r(k)>
      beta    = (rho(k) / rho(k-1)) * (alpha / qsi)
      p(k)    = mr(k) + beta*(p(k-1) - u(k))
      ap(k)   = amr(k) + beta*(ap(k-1) - au(k))

   In the first pass (k=1) the code uses qsi = tmpdot1 / tmpdot4 and eta = 0.
  */
  
  initial(bv, xv, rv, r2v, n);
  
  bv_norm = norm(bv, n, local_barrier, pnum);
  
  if(my_rank == 0)
    {
      outn("=== BiCGSafe iterations ===");
    }

#if DEBUG
  rv_norm = norm(bv, n, local_barrier, pnum);
  if(my_rank == 0)
    outn("bv_norm = %.12e", rv_norm);
  matvec(am, bv, tv, av, local_barrier, pnum);
  matvec(am, bv, tv, av, local_barrier, pnum);
  rv_norm = norm(tv, n, local_barrier, pnum);
  if(my_rank == 0)
    outn("Ab_norm = %.12e", rv_norm);
  //sleep(2);
#endif
    
  /* The shadow vector r2 set by initial() is passed through matvec twice
     (via tv and back into r2v) before it is used. */
  matvec(am, r2v, tv, av, local_barrier, pnum);
  matvec(am, tv, r2v, av, local_barrier, pnum);
  
  fwbk(mm, rv, mrv, ev, local_barrier, pnum);
  
  matvec(am, mrv, amrv, av, local_barrier, pnum);
  
  copy(pv, mrv, n);
  copy(apv, amrv, n);
  
  rho = dot(r2v, rv, n, local_barrier, pnum);
  beta = 0;
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  time_lap(13);
  
  /* iter is the 0-based loop index; iter_done counts completed iterations so
     the report and the per-iteration averages are right whether the loop
     ends by convergence (break) or by reaching niter. */
  iter_done = 0;
  for(iter = 0; iter < scaleunit.niter; iter++)
    {
      time_lap(50);
      
      tmpdot0 = dot(apv, r2v, n, local_barrier, pnum);
      
      rho_old = rho;
      alpha = rho_old / tmpdot0;
      
      if(iter == 0)
        {
          /* No y from a previous step yet: one-parameter minimisation. */
          tmpdot1 = dot(amrv, rv, n, local_barrier, pnum);
          tmpdot4 = dot(amrv, amrv, n, local_barrier, pnum);
          
          qsi = tmpdot1 / tmpdot4;
          eta = 0;
        }
      else
        {
          tmpdot0 = dot(yv, yv, n, local_barrier, pnum);
          tmpdot1 = dot(amrv, rv, n, local_barrier, pnum);
          tmpdot2 = dot(yv, rv, n, local_barrier, pnum);
          tmpdot3 = dot(amrv, yv, n, local_barrier, pnum);
          tmpdot4 = dot(amrv, amrv, n, local_barrier, pnum);
          
          tmp = tmpdot4 * tmpdot0 - tmpdot3 * tmpdot3;
          qsi = (tmpdot0 * tmpdot1 - tmpdot2 * tmpdot3) / tmp;
          eta = (tmpdot4 * tmpdot2 - tmpdot3 * tmpdot1) / tmp;
        }
      
      daxpy4(tv, apv, yv, qsi, eta, n);
      
      fwbk(mm, tv, mtv, ev, local_barrier, pnum);
      
      daxpy4(uv, mtv, uv, 1, eta * beta, n);
      
      matvec(am, uv, auv, av, local_barrier, pnum);
      
      daxpy5(zv, mrv, uv, qsi, eta, -alpha, n);
      
      daxpy5(yv, amrv, auv, qsi, eta, -alpha, n);
      
      daxpy2(xv, pv, zv, alpha, 1, n);
      
      daxpy2(rv, apv, yv, -alpha, -1, n);
      
      fwbk(mm, rv, mrv, ev, local_barrier, pnum);
      
      matvec(am, mrv, amrv, av, local_barrier, pnum);
      
      rho = dot(r2v, rv, n, local_barrier, pnum);
      
      beta = (rho / rho_old) * (alpha / qsi);
      
      daxpy3(pv, mrv, uv, beta, 1, n);
      
      daxpy3(apv, amrv, auv, beta, 1, n);
      
      /* The square root is taken of the ratio of the two norm() results. */
      rv_norm = norm(rv, n, local_barrier, pnum);
      res = sqrt(rv_norm / bv_norm);
      
      iter_done = iter + 1;
      
      if(my_rank == 0)
        {
          outn("iteration=%d time=%lf res=%.12e", iter, time_diff(50), res);
        }
      if(res < EPS)
        {
          break;
        }
    }
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  time_lap(14);
  
  /* Recompute b - A*x explicitly to cross-check the recurrence residual. */
  matvec(am, xv, tv, av, local_barrier, pnum);
  daxpy(uv, bv, tv, -1, n);
  bax_norm = norm(uv, n, local_barrier, pnum);
  rv_norm = norm(rv, n, local_barrier, pnum);
  xv_norm = norm(xv, n, local_barrier, pnum);
  
  if(my_rank == 0)
    {
      outn("=========================================");
      outn("cg iteration    = %d", iter_done);
      outn("decompose time  = %.12lf sec", time_ref(11) - time_ref(10));
      outn("rwset time      = %.12lf sec", time_ref(12) - time_ref(11));
      outn("ilu time        = %.12lf sec", time_ref(13) - time_ref(12));
      outn("bicg time       = %.12lf sec", time_ref(14) - time_ref(13));
      outn("total time      = %.12lf sec", time_ref(14) - time_ref(12));
      outn("|b-Ax|/|b|      = %.12e", sqrt(bax_norm / bv_norm));
      outn("|r|/|b|         = %.12e", sqrt(rv_norm / bv_norm));
      outn("|x|^2           = %.12e", xv_norm);
      outn("=========================================");
    }
  
  /* Each timer is summed over all ranks and reported as the mean. */
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t_dot, &t_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(my_rank == 0)
    outn("dot time        = %.12lf", t_sum / pnum);
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t_matvec, &t_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(my_rank == 0)
    outn("matvec time     = %.12lf", t_sum / pnum);
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t_fwbk, &t_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(my_rank == 0)
    outn("fwbk time       = %.12lf", t_sum / pnum);
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t_barr, &t_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(my_rank == 0)
    outn("barr time       = %.12lf", t_sum / pnum);
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t_comm, &t_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(my_rank == 0)
    outn("comm time       = %.12lf", t_sum / pnum);
  if(my_rank == 0)
    outn("=========================================");
  
  /* Per-iteration averages; report zeros instead of dividing by zero when no
     iteration ran. print_calctime() is still called by every rank because it
     receives the barrier. */
  avg_total = 0;
  avg_calc = 0;
  if(iter_done > 0)
    {
      avg_total = (time_ref(14) - time_ref(13)) / iter_done;
      avg_calc = t_calc / iter_done;
    }
  print_calctime(avg_total, avg_calc, local_barrier, my_rank, pnum);
  
  free_mm(mm);
  
  my_free(ev);
  my_free(av);
  my_free(xv);
  my_free(zv);
  my_free(tv);
  my_free(auv);
  my_free(uv);
  my_free(mtv);
  my_free(yv);
  my_free(apv);
  my_free(pv);
  my_free(amrv);
  my_free(mrv);
  my_free(r2v);
  my_free(rv);
  
  free_data(data);
  
  catch(DMI_local_barrier_destroy(local_barrier));
  
  my_free(local_barrier);
  return 0;
}

/*
 * Registers one matrix with its rwset: the ids to[inner[i]] (i < n) are
 * announced as this rank's write set, then the local rwset handle is
 * initialised with all extn ids of to[] as the read set. Barriers precede
 * both the decompose call and the local initialisation.
 */
static void decompose_register(am_t *mat, int my_rank, int64_t rwset_addr, DMI_local_barrier_t *local_barrier, int pnum)
{
  int32_t idx;
  int32_t owned_num = mat->n;
  int32_t visible_num = mat->extn;
  int64_t *owned_ids = (int64_t*)my_malloc(sizeof(int64_t) * owned_num);
  int64_t *visible_ids = (int64_t*)my_malloc(sizeof(int64_t) * visible_num);
  
  for(idx = 0; idx < owned_num; idx++)
    owned_ids[idx] = mat->to[mat->inner[idx]];
  for(idx = 0; idx < visible_num; idx++)
    visible_ids[idx] = mat->to[idx];
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_rwset_decompose(rwset_addr, my_rank, owned_ids, owned_num));
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_rwset_init(mat->local_rwset, rwset_addr, my_rank, visible_ids, visible_num));
  
  my_free(visible_ids);
  my_free(owned_ids);
}

/*
 * Sets up the rwsets of both matrices of this rank, am first and em second.
 * Collective: every one of the pnum ranks has to call it because each
 * registration passes through barriers.
 */
void decompose(am_t *am, am_t *em, int my_rank, int64_t am_rwset_addr, int64_t em_rwset_addr, DMI_local_barrier_t *local_barrier, int pnum)
{
  decompose_register(am, my_rank, am_rwset_addr, local_barrier, pnum);
  decompose_register(em, my_rank, em_rwset_addr, local_barrier, pnum);
}

data_t* make_data(char *dataset, int my_rank, int pnum)
{
  data_t *data;
  am_t *am, *em;
  double *bv;
  FILE *fp;
  char file[FNAME_SIZE];
  int ret, num, tmp, marker, rank;
  int sizes[pnum];
  
  data = (data_t*)my_malloc(sizeof(data_t));
  
  sprintf(file, "%s/%s_%d_%d.img", DIR, dataset, my_rank, pnum);
  fp = fopen(file, "rb");
  if(fp == NULL) error();
  
  am = (am_t*)my_malloc(sizeof(am_t));
  am->local_rwset = my_malloc(sizeof(DMI_local_rwset_t));
  
  num = 1;
  ret = fread(&am->n, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = 1;
  ret = fread(&am->extn, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = 1;
  ret = fread(&am->nplu, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  am->rows = (int*)my_malloc((am->n + 1) * sizeof(int));
  am->cols = (int*)my_malloc(am->nplu * sizeof(int));
  am->ds = (double*)my_malloc(am->nplu * 9 * sizeof(double));
  am->to = (int*)my_malloc(am->extn * sizeof(int));
  am->inner = (int*)my_malloc(am->n * sizeof(int));
  
  num = am->n + 1;
  ret = fread(am->rows, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = am->nplu;
  ret = fread(am->cols, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = am->nplu * 9;
  ret = fread(am->ds, sizeof(double), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = am->extn;
  ret = fread(am->to, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = am->n;
  ret = fread(am->inner, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  num = pnum;
  ret = fread(sizes, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  tmp = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      tmp += sizes[rank];
    }
  ret = fseek(fp, tmp * sizeof(int), SEEK_CUR);
  if(ret < 0) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  num = pnum;
  ret = fread(sizes, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  tmp = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      tmp += sizes[rank];
    }
  ret = fseek(fp, tmp * sizeof(int), SEEK_CUR);
  if(ret < 0) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  assert(tmp == am->extn - am->n);
  
  em = (am_t*)my_malloc(sizeof(am_t));
  em->local_rwset = my_malloc(sizeof(DMI_local_rwset_t));
  
  num = 1;
  ret = fread(&em->n, sizeof(int), num, fp);
  if(ret != num) error();
  assert(am->n == em->n);
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = 1;
  ret = fread(&em->extn, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = 1;
  ret = fread(&em->nplu, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  em->rows = (int*)my_malloc((em->extn + 1) * sizeof(int));
  em->cols = (int*)my_malloc(em->nplu * sizeof(int));
  em->ds = (double*)my_malloc(em->nplu * 9 * sizeof(double));
  em->to = (int*)my_malloc(em->extn * sizeof(int));
  em->inner = (int*)my_malloc(em->n * sizeof(int));
  
  num = em->extn + 1;
  ret = fread(em->rows, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = em->nplu;
  ret = fread(em->cols, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = em->nplu * 9;
  ret = fread(em->ds, sizeof(double), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = em->extn;
  ret = fread(em->to, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  num = em->n;
  ret = fread(em->inner, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  num = pnum;
  ret = fread(sizes, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  tmp = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      tmp += sizes[rank];
    }
  ret = fseek(fp, tmp * sizeof(int), SEEK_CUR);
  if(ret < 0) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  
  num = pnum;
  ret = fread(sizes, sizeof(int), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  tmp = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      tmp += sizes[rank];
    }
  ret = fseek(fp, tmp * sizeof(int), SEEK_CUR);
  if(ret < 0) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER) error();
#endif
  assert(tmp == em->extn - em->n);
  
  bv = (double*)my_malloc(am->n * 3 * sizeof(double));
  num = am->n * 3;
  ret = fread(bv, sizeof(double), num, fp);
  if(ret != num) error();
#if MARKER
  num = 1; ret = fread(&marker, sizeof(int), num, fp); if(ret != num) error(); if(marker != MARKER_END) error();
#endif
  fclose(fp);
  
  data->am = am;
  data->em = em;
  data->bv = bv;
  return data;
}

void free_data(data_t *data)
{
  am_t *am, *em;
  
  my_free(data->bv);
  em = data->em;
  catch(DMI_local_rwset_destroy(em->local_rwset));
  my_free(em->local_rwset);
  my_free(em->inner);
  my_free(em->to);
  my_free(em->ds);
  my_free(em->cols);
  my_free(em->rows);
  my_free(em);
  am = data->am;
  catch(DMI_local_rwset_destroy(am->local_rwset));
  my_free(am->local_rwset);
  my_free(am->inner);
  my_free(am->to);
  my_free(am->ds);
  my_free(am->cols);
  my_free(am->rows);
  my_free(am);
  my_free(data);
  return;
}

/*
 * Builds the block factorization of em that is used as preconditioner.
 *
 * em     : source matrix with extn rows and 9 doubles per stored block
 * fillin : highest fill level kept; candidate entries whose level exceeds it
 *          are dropped
 *
 * Phase 1 (pattern): for every row the strictly lower entries (col < i) and
 * the remaining entries (col >= i) are scattered into line buffers, then the
 * lower entries are visited in ascending column order and the pattern of the
 * corresponding earlier upper rows is merged in, tracking a level per entry.
 * Phase 2 (values): the rows are eliminated with 3x3 block arithmetic; the
 * first stored block of every upper row is finally replaced by its inverse.
 * NOTE(review): the code treats that first upper block as the diagonal block,
 * which presumably relies on the input listing the diagonal first among the
 * columns >= i — confirm.
 *
 * Returns a newly allocated mm_t to be released with free_mm(). Its to, inner
 * and local_rwset members alias those of em and stay owned by em. With
 * REALLOC == 0 the factors are limited to extn * 200 blocks each; exceeding
 * that ends in error().
 */
mm_t* make_mm(am_t *em, int fillin)
{
  /* Blocks reserved per row (on average) when REALLOC is 0. */
  enum { ROW_CAPACITY = 200 };
  mm_t *mm;
  int i, j, jj, k, extn, lev, colmin, linc, uinc, m, col, jmin, tmp;
  int *levs, *levline, *colline, *jline, *lcols, *lrows, *ucols, *urows;
  double d0, d1, d2, d3, d4, d5, d6, d7, d8, inv_det;
  double *dlds, *duds, *dd_p, *dc_p, *dl_p, *du_p, *d_p, *d2_p, *dline;
  double *lds, *uds, *fd_p;
  double dtmp[9];
#if !REALLOC
  size_t capacity;
#endif
  
  extn = em->extn;
  
  /* All sizes are computed in size_t: extn * 200 * 9 overflows int for
     matrices with more than about 1.19 million rows. */
#if REALLOC
  levs = (int*)my_malloc((size_t)extn * sizeof(int));
#else
  capacity = (size_t)extn * ROW_CAPACITY;
  levs = (int*)my_malloc(capacity * sizeof(int));
#endif
  /* Line buffers for the row being built: slots [0, linc) hold its lower
     part, slots [i, uinc) its upper part. jline maps a column to its slot
     (-1 when the column is absent). */
  levline = (int*)my_malloc((size_t)extn * sizeof(int));
  colline = (int*)my_malloc((size_t)extn * sizeof(int));
  dline = (double*)my_malloc((size_t)extn * 9 * sizeof(double));
  jline = (int*)my_malloc((size_t)extn * sizeof(int));
  
#if REALLOC
  dlds = NULL;
  lcols = NULL;
  duds = NULL;
  ucols = NULL;
#else
  dlds = (double*)my_malloc(capacity * 9 * sizeof(double));
  lcols = (int*)my_malloc(capacity * sizeof(int));
  duds = (double*)my_malloc(capacity * 9 * sizeof(double));
  ucols = (int*)my_malloc(capacity * sizeof(int));
#endif
  lrows = (int*)my_malloc(((size_t)extn + 1) * sizeof(int));
  urows = (int*)my_malloc(((size_t)extn + 1) * sizeof(int));
  
  for(i = 0; i < extn; i++)
    {
      jline[i] = -1;
    }
  urows[0] = 0;
  lrows[0] = 0;
  
  /* ---- Phase 1: sparsity pattern with fill levels ---- */
  for(i = 0; i < extn; i++)
    {
      linc = 0;
      uinc = i;
      
      /* Scatter the original entries of row i (level 0); columns outside
         [0, extn) are ignored. */
      for(j = em->rows[i]; j < em->rows[i + 1]; j++)
        {
          col = em->cols[j];
          if(0 <= col && col < i)
            {
              colline[linc] = col;
              levline[linc] = 0;
              d2_p = &dline[linc * 9];
              d_p = &em->ds[j * 9];
              for(m = 0; m < 9; m++)
                d2_p[m] = d_p[m];
              jline[col] = linc++;
            }
          else if(i <= col && col < extn)
            {
              colline[uinc] = col;
              levline[uinc] = 0;
              d2_p = &dline[uinc * 9];
              d_p = &em->ds[j * 9];
              for(m = 0; m < 9; m++)
                d2_p[m] = d_p[m];
              jline[col] = uinc++;
            }
        }
      
      /* Visit the lower entries in ascending column order. linc can grow
         while this loop runs, so the minimum is selected on the fly. */
      for(j = 0; j < linc; j++)
        {
          col = colline[j];
          colmin = col;
          jmin = j;
          for(k = j + 1; k < linc; k++)
            {
              if(colline[k] < colmin)
                {
                  colmin = colline[k];
                  jmin = k;
                }
            }
          
          if(jmin != j)
            {
              /* Swap slots j and jmin (column, level, block, jline entry). */
              colline[j] = colmin;
              colline[jmin] = col;
              jline[colmin] = j;
              jline[col] = jmin;
              tmp = levline[j];
              levline[j] = levline[jmin];
              levline[jmin] = tmp;
              d_p = &dline[j * 9];
              for(m = 0; m < 9; m++)
                dtmp[m] = d_p[m];
              d2_p = &dline[j * 9];
              d_p = &dline[jmin * 9];
              for(m = 0; m < 9; m++)
                d2_p[m] = d_p[m];
              d_p = &dline[jmin * 9];
              for(m = 0; m < 9; m++)
                d_p[m] = dtmp[m];
              col = colmin;
            }
          
          /* Merge the pattern of upper row col into row i. */
          for(k = urows[col]; k < urows[col + 1]; k++)
            {
              jj = jline[ucols[k]];
              lev = levs[k] + levline[j] + 1;
              if(lev > fillin)
                {
                  continue;
                }
              if(jj == -1)
                {
                  /* New fill entry, stored as a zero block. */
                  if(ucols[k] < i)
                    {
                      colline[linc] = ucols[k];
                      levline[linc] = lev;
                      d_p = &dline[linc * 9];
                      for(m = 0; m < 9; m++)
                        d_p[m] = 0;
                      jline[ucols[k]] = linc++;
                    }
                  else if(ucols[k] >= i)
                    {
                      colline[uinc] = ucols[k];
                      levline[uinc] = lev;
                      d_p = &dline[uinc * 9];
                      for(m = 0; m < 9; m++)
                        d_p[m] = 0;
                      jline[ucols[k]] = uinc++;
                    }
                }
              else
                {
                  /* Existing entry: keep the smallest level seen. */
                  if(lev < levline[jj])
                    {
                      levline[jj] = lev;
                    }
                }
            }
        }
      
      /* Reset the column map for the next row. */
      for(j = 0; j < linc; j++)
        {
          jline[colline[j]] = -1;
        }
      for(j = i; j < uinc; j++)
        {
          jline[colline[j]] = -1;
        }
      
      /* Append the lower part of row i to the L arrays. */
      lrows[i + 1] = lrows[i] + linc;
      if(linc > 0)
        {
#if REALLOC
          lcols = (int*)my_realloc(lcols, lrows[i + 1] * sizeof(int));
          dlds = (double*)my_realloc(dlds, lrows[i + 1] * 9 * sizeof(double));
#else
          if((size_t)lrows[i + 1] > capacity)
            {
              errn("make_mm : lower factor exceeds preallocated capacity");
              error();
            }
#endif
          memcpy(lcols + lrows[i], colline, linc * sizeof(int));
          memcpy(dlds + lrows[i] * 9, dline, linc * 9 * sizeof(double));
        }
      
      /* Append the upper part of row i, including its levels, to the U
         arrays. */
      k = uinc - i;
      urows[i + 1] = urows[i] + k;
      if(k > 0)
        {
#if REALLOC
          ucols = (int*)my_realloc(ucols, urows[i + 1] * sizeof(int));
          duds = (double*)my_realloc(duds, urows[i + 1] * 9 * sizeof(double));
          levs = (int*)my_realloc(levs, urows[i + 1] * sizeof(int));
#else
          if((size_t)urows[i + 1] > capacity)
            {
              errn("make_mm : upper factor exceeds preallocated capacity");
              error();
            }
#endif
          memcpy(ucols + urows[i], colline + i, k * sizeof(int));
          memcpy(levs + urows[i], levline + i, k * sizeof(int));
          memcpy(duds + urows[i] * 9, dline + i * 9, k * 9 * sizeof(double));
        }
    }
  
  my_free(dline);
  my_free(colline);
  my_free(levline);
  my_free(levs);
  
#if DEBUG
  double sum = 0;
  for(i = 0; i < extn; i++)
    {
      for(j = em->rows[i]; j < em->rows[i + 1]; j++)
        {
          d_p = &em->ds[j * 9];
          for(k = 0; k < 9; k++)
            {
              sum += d_p[k];
            }
        }
    }
  outn("em : sum=%.12e", sum);
#endif
  
  /* ---- Phase 2: numeric factorization ---- */
  /* jline is reused as a map from column to entry index within the upper
     row currently used as pivot row. */
  for(j = 0; j < extn; j++)
    {
      jline[j] = -1;
    }
  for(i = 0; i < extn; i++)
    {
      for(k = lrows[i]; k < lrows[i + 1]; k++)
        {
          /* First block of pivot row lcols[k]; it was already replaced by
             its inverse when that earlier row was finished. */
          dd_p = &duds[urows[lcols[k]] * 9];
          dl_p = &dlds[k * 9];
          
          /* dl = dl * dd (3x3, row-major). */
          d0 = dl_p[0]; d1 = dl_p[1]; d2 = dl_p[2];
          d3 = dl_p[3]; d4 = dl_p[4]; d5 = dl_p[5];
          d6 = dl_p[6]; d7 = dl_p[7]; d8 = dl_p[8];
          dl_p[0] = d0 * dd_p[0] + d1 * dd_p[3] + d2 * dd_p[6];
          dl_p[1] = d0 * dd_p[1] + d1 * dd_p[4] + d2 * dd_p[7];
          dl_p[2] = d0 * dd_p[2] + d1 * dd_p[5] + d2 * dd_p[8];
          dl_p[3] = d3 * dd_p[0] + d4 * dd_p[3] + d5 * dd_p[6];
          dl_p[4] = d3 * dd_p[1] + d4 * dd_p[4] + d5 * dd_p[7];
          dl_p[5] = d3 * dd_p[2] + d4 * dd_p[5] + d5 * dd_p[8];
          dl_p[6] = d6 * dd_p[0] + d7 * dd_p[3] + d8 * dd_p[6];
          dl_p[7] = d6 * dd_p[1] + d7 * dd_p[4] + d8 * dd_p[7];
          dl_p[8] = d6 * dd_p[2] + d7 * dd_p[5] + d8 * dd_p[8];
          
          /* Index the pivot row by column. */
          for(j = urows[lcols[k]]; j < urows[lcols[k] + 1]; j++)
            {
              jline[ucols[j]] = j;
            }
          /* Update the remaining lower entries of row i: dc -= dl * du. */
          for(j = k + 1; j < lrows[i + 1]; j++)
            {
              if(jline[lcols[j]] != -1)
                {
                  dc_p = &dlds[j * 9];
                  du_p = &duds[jline[lcols[j]] * 9];
                  
                  dc_p[0] -= dl_p[0] * du_p[0] + dl_p[1] * du_p[3] + dl_p[2] * du_p[6];
                  dc_p[1] -= dl_p[0] * du_p[1] + dl_p[1] * du_p[4] + dl_p[2] * du_p[7];
                  dc_p[2] -= dl_p[0] * du_p[2] + dl_p[1] * du_p[5] + dl_p[2] * du_p[8];
                  dc_p[3] -= dl_p[3] * du_p[0] + dl_p[4] * du_p[3] + dl_p[5] * du_p[6];
                  dc_p[4] -= dl_p[3] * du_p[1] + dl_p[4] * du_p[4] + dl_p[5] * du_p[7];
                  dc_p[5] -= dl_p[3] * du_p[2] + dl_p[4] * du_p[5] + dl_p[5] * du_p[8];
                  dc_p[6] -= dl_p[6] * du_p[0] + dl_p[7] * du_p[3] + dl_p[8] * du_p[6];
                  dc_p[7] -= dl_p[6] * du_p[1] + dl_p[7] * du_p[4] + dl_p[8] * du_p[7];
                  dc_p[8] -= dl_p[6] * du_p[2] + dl_p[7] * du_p[5] + dl_p[8] * du_p[8];
                }
            }
          /* Update the upper entries of row i the same way. */
          for(j = urows[i]; j < urows[i + 1]; j++)
            {
              if(jline[ucols[j]] != -1)
                {
                  dc_p = &duds[j * 9];
                  du_p = &duds[jline[ucols[j]] * 9];
                  
                  dc_p[0] -= dl_p[0] * du_p[0] + dl_p[1] * du_p[3] + dl_p[2] * du_p[6];
                  dc_p[1] -= dl_p[0] * du_p[1] + dl_p[1] * du_p[4] + dl_p[2] * du_p[7];
                  dc_p[2] -= dl_p[0] * du_p[2] + dl_p[1] * du_p[5] + dl_p[2] * du_p[8];
                  dc_p[3] -= dl_p[3] * du_p[0] + dl_p[4] * du_p[3] + dl_p[5] * du_p[6];
                  dc_p[4] -= dl_p[3] * du_p[1] + dl_p[4] * du_p[4] + dl_p[5] * du_p[7];
                  dc_p[5] -= dl_p[3] * du_p[2] + dl_p[4] * du_p[5] + dl_p[5] * du_p[8];
                  dc_p[6] -= dl_p[6] * du_p[0] + dl_p[7] * du_p[3] + dl_p[8] * du_p[6];
                  dc_p[7] -= dl_p[6] * du_p[1] + dl_p[7] * du_p[4] + dl_p[8] * du_p[7];
                  dc_p[8] -= dl_p[6] * du_p[2] + dl_p[7] * du_p[5] + dl_p[8] * du_p[8];
                }
            }
          /* Clear the column index again. */
          for(j = urows[lcols[k]]; j < urows[lcols[k] + 1]; j++)
            {
              jline[ucols[j]] = -1;
            }
        }
      
      /* Replace the first upper block of row i by its inverse
         (adjugate divided by the determinant). */
      dd_p = &duds[urows[i] * 9];
                  
      d0 = dd_p[0]; d1 = dd_p[1]; d2 = dd_p[2];
      d3 = dd_p[3]; d4 = dd_p[4]; d5 = dd_p[5];
      d6 = dd_p[6]; d7 = dd_p[7]; d8 = dd_p[8];
      inv_det = 1 / (d0 * d4 * d8 + d2 * d3 * d7 + d1 * d5 * d6
                     - d0 * d5 * d7 - d2 * d4 * d6 - d1 * d3 * d8);
      dd_p[0] = (d4 * d8 - d5 * d7) * inv_det;
      dd_p[1] = (d2 * d7 - d1 * d8) * inv_det;
      dd_p[2] = (d1 * d5 - d2 * d4) * inv_det;
      dd_p[3] = (d5 * d6 - d3 * d8) * inv_det;
      dd_p[4] = (d0 * d8 - d2 * d6) * inv_det;
      dd_p[5] = (d2 * d3 - d0 * d5) * inv_det;
      dd_p[6] = (d3 * d7 - d4 * d6) * inv_det;
      dd_p[7] = (d1 * d6 - d0 * d7) * inv_det;
      dd_p[8] = (d0 * d4 - d1 * d3) * inv_det;
    }
  
  my_free(jline);
  
#if DEBUG
  sum = 0;
  for(i = 0; i < extn; i++)
    {
      for(j = lrows[i]; j < lrows[i + 1]; j++)
        {
          d_p = &dlds[j * 9];
          for(k = 0; k < 9; k++)
            {
              sum += d_p[k];
            }
        }
      for(j = urows[i]; j < urows[i + 1]; j++)
        {
          d_p = &duds[j * 9];
          for(k = 0; k < 9; k++)
            {
              sum += d_p[k];
            }
        }
    }
  outn("mm : sum=%.12e", sum);
#endif
  
  /* Copy the values into exactly sized arrays and drop the work buffers. */
  lds = (double*)my_malloc((size_t)lrows[extn] * 9 * sizeof(double));
  uds = (double*)my_malloc((size_t)urows[extn] * 9 * sizeof(double));
  for(i = 0; i < extn; i++)
    {
      for(j = lrows[i]; j < lrows[i + 1]; j++)
        {
          fd_p = &lds[j * 9];
          d_p = &dlds[j * 9];
          for(k = 0; k < 9; k++)
            {
              fd_p[k] = d_p[k];
            }
        }
      for(j = urows[i]; j < urows[i + 1]; j++)
        {
          fd_p = &uds[j * 9];
          d_p = &duds[j * 9];
          for(k = 0; k < 9; k++)
            {
              fd_p[k] = d_p[k];
            }
        }
    }
  
  my_free(duds);
  my_free(dlds);
  
  mm = (mm_t*)my_malloc(sizeof(mm_t));
  mm->extn = extn;
  mm->n = em->n;
  /* Total number of stored factor blocks, so the field no longer holds an
     indeterminate value. */
  mm->nplu = lrows[extn] + urows[extn];
  mm->lrows = lrows;
  mm->lcols = lcols;
  mm->lds = lds;
  mm->urows = urows;
  mm->ucols = ucols;
  mm->uds = uds;
  mm->to = em->to;
  mm->inner = em->inner;
  mm->local_rwset = em->local_rwset;
  return mm;
}

/*
 * Release an mm_t and the six arrays it owns.
 *
 * Only the l* and u* arrays (row offsets, column indices, block values) and
 * the struct itself are freed.  The to, inner and local_rwset members are
 * left untouched: make_mm() copies those pointers from its em argument
 * rather than allocating them.
 */
void free_mm(mm_t *mm)
{
  /* u* arrays: values, column indices, row offsets. */
  my_free(mm->uds);
  my_free(mm->ucols);
  my_free(mm->urows);

  /* l* arrays: values, column indices, row offsets. */
  my_free(mm->lds);
  my_free(mm->lcols);
  my_free(mm->lrows);

  /* Finally the container itself. */
  my_free(mm);
}

/*
 * Block sparse matrix-vector product: qv = A * av.
 *
 *   am            - matrix; rows/cols index 3x3 blocks stored row-major,
 *                   9 doubles per block, consecutively in am->ds
 *   pv            - vector handed to DMI_local_rwset_write()
 *   qv            - output, am->n nodes x 3 doubles
 *   av            - vector filled by DMI_local_rwset_read() and then used as
 *                   the right-hand operand (indexed by cols[] * 3)
 *   local_barrier - barrier synchronised before the write and before the read
 *   pnum          - participant count passed to the barrier
 *
 * NOTE(review): presumably the write/read pair exchanges pv between
 * processes so that av also holds remote entries -- confirm against the DMI
 * documentation.
 *
 * The exchange is accounted to t_comm, the multiply loop to t_calc, and
 * t_matvec receives time_diff(1) taken after the multiply.
 */
void matvec(am_t *am, double *pv, double *qv, double *av, DMI_local_barrier_t *local_barrier, int32_t pnum)
{
  const int nrows = am->n;
  const int *row_ptr = am->rows;
  const int *col_idx = am->cols;
  const double *blk = am->ds;
  double *out = qv;
  int r, k;

  catch(DMI_local_barrier_sync(local_barrier, pnum));

  time_lap(1);

  catch(DMI_local_rwset_write(am->local_rwset, pv, NULL));

  catch(DMI_local_barrier_sync(local_barrier, pnum));

  catch(DMI_local_rwset_read(am->local_rwset, av, NULL));

  t_comm += time_diff(1);
  time_lap(1);
  time_lap(60);

  /* blk walks the block array in storage order; it is never reset per row. */
  for(r = 0; r < nrows; r++, out += 3)
    {
      double acc0 = 0, acc1 = 0, acc2 = 0;

      for(k = row_ptr[r]; k < row_ptr[r + 1]; k++, blk += 9)
        {
          const double *x = &av[col_idx[k] * 3];

          acc0 += blk[0] * x[0] + blk[1] * x[1] + blk[2] * x[2];
          acc1 += blk[3] * x[0] + blk[4] * x[1] + blk[5] * x[2];
          acc2 += blk[6] * x[0] + blk[7] * x[1] + blk[8] * x[2];
        }

      out[0] = acc0;
      out[1] = acc1;
      out[2] = acc2;
    }

  t_calc += time_diff(60);
  t_matvec += time_diff(1);
}

/*
 * Forward/backward substitution with the block factors held in mm.
 *
 *   mm            - factor data produced by make_mm(); blocks are 3x3,
 *                   row-major, 9 doubles each
 *   rv            - input vector, handed to DMI_local_rwset_write()
 *   zv            - output, mm->n nodes x 3 doubles
 *   ev            - work vector, mm->extn nodes x 3 doubles, filled by
 *                   DMI_local_rwset_read() and solved in place
 *   local_barrier - barrier synchronised before the write and before the read
 *   pnum          - participant count passed to the barrier
 *
 * Steps:
 *   1. exchange: write rv, sync, read into ev (time goes to t_comm);
 *   2. forward sweep over the l* arrays: ev[i] -= L(i,c) * ev[c] for every
 *      stored column c < i (no diagonal scaling is applied here);
 *   3. backward sweep over the u* arrays: off-diagonal blocks (columns > i)
 *      are subtracted, then the row's first block is multiplied in.
 *      make_mm() stores the already-inverted diagonal block in that first
 *      slot, which is why this is a multiply and not a solve;
 *   4. gather: zv[i] = ev[inner[i]] for i < n.
 *
 * Every U row must therefore contain at least its leading (diagonal) block.
 *
 * NOTE(review): presumably the write/read pair exchanges rv between
 * processes so that ev also holds remote entries -- confirm against the DMI
 * documentation.
 */
void fwbk(mm_t *mm, double *rv, double *zv, double *ev, DMI_local_barrier_t *local_barrier, int32_t pnum)
{
  int i, j, n, extn;
  int *lcols, *lrows, *ucols, *urows, *inner;
  double s0, s1, s2;
  double *ev_p, *ev_pp, *zv_p;
  double *d_p, *lds, *uds;
  
  n = mm->n;
  extn = mm->extn;
  lrows = mm->lrows;
  urows = mm->urows;
  lcols = mm->lcols;
  ucols = mm->ucols;
  lds = mm->lds;
  uds = mm->uds;
  inner = mm->inner;
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  time_lap(1);
  
  catch(DMI_local_rwset_write(mm->local_rwset, rv, NULL));
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  catch(DMI_local_rwset_read(mm->local_rwset, ev, NULL));
  
  t_comm += time_diff(1);
  time_lap(1);
  time_lap(60);
  
  /* Forward sweep: d_p walks the L blocks in storage order. */
  ev_p = ev;
  d_p = lds;
  for(i = 0; i < extn; i++)
    {
      s0 = ev_p[0]; s1 = ev_p[1]; s2 = ev_p[2];
      for(j = lrows[i]; j < lrows[i + 1]; j++)
        {
#if ASSERT
          assert(0 <= lcols[j] && lcols[j] < i);
#endif
          ev_pp = &ev[lcols[j] * 3];
          s0 -= d_p[0] * ev_pp[0] + d_p[1] * ev_pp[1] + d_p[2] * ev_pp[2];
          s1 -= d_p[3] * ev_pp[0] + d_p[4] * ev_pp[1] + d_p[5] * ev_pp[2];
          s2 -= d_p[6] * ev_pp[0] + d_p[7] * ev_pp[1] + d_p[8] * ev_pp[2];
          d_p += 9;
        }
      ev_p[0] = s0; ev_p[1] = s1; ev_p[2] = s2;
      ev_p += 3;
    }
  
  /*
   * Backward sweep.  Block and vector addresses are derived from j and i
   * instead of being decremented step by step, so no pointer before the
   * start of uds or ev is ever formed (including when extn == 0).
   */
  for(i = extn - 1; i >= 0; i--)
    {
      ev_p = &ev[i * 3];
      s0 = ev_p[0]; s1 = ev_p[1]; s2 = ev_p[2];
      
      /* Off-diagonal blocks of row i, last to first; slot urows[i] is skipped. */
      for(j = urows[i + 1] - 1; j > urows[i]; j--)
        {
#if ASSERT
          assert(i < ucols[j] && ucols[j] <= extn - 1);
#endif
          d_p = &uds[j * 9];
          ev_pp = &ev[ucols[j] * 3];
          s0 -= d_p[0] * ev_pp[0] + d_p[1] * ev_pp[1] + d_p[2] * ev_pp[2];
          s1 -= d_p[3] * ev_pp[0] + d_p[4] * ev_pp[1] + d_p[5] * ev_pp[2];
          s2 -= d_p[6] * ev_pp[0] + d_p[7] * ev_pp[1] + d_p[8] * ev_pp[2];
        }
      
      /* Leading block of the row: the inverted diagonal block from make_mm(). */
      d_p = &uds[urows[i] * 9];
      ev_p[0] = d_p[0] * s0 + d_p[1] * s1 + d_p[2] * s2;
      ev_p[1] = d_p[3] * s0 + d_p[4] * s1 + d_p[5] * s2;
      ev_p[2] = d_p[6] * s0 + d_p[7] * s1 + d_p[8] * s2;
    }
  
  /* Gather the entries selected by inner[] into the caller's output. */
  zv_p = zv;
  for(i = 0; i < n; i++)
    {
      ev_p = &ev[inner[i] * 3];
      zv_p[0] = ev_p[0]; zv_p[1] = ev_p[1]; zv_p[2] = ev_p[2];
      zv_p += 3;
    }
  
  t_calc += time_diff(60);
  t_fwbk += time_diff(1);
  return;
}

/*
 * Set up the starting vectors for n nodes of 3 doubles each:
 *   xv  <- 0
 *   rv  <- bv
 *   r2v <- 1
 * bv is only read.  Nothing is written when n <= 0.
 */
void initial(double *bv, double *xv, double *rv, double *r2v, int n)
{
  int node;
  double *b = bv;
  double *x = xv;
  double *r = rv;
  double *r2 = r2v;

  for(node = 0; node < n; node++, b += 3, x += 3, r += 3, r2 += 3)
    {
      x[0] = 0;
      x[1] = 0;
      x[2] = 0;

      r[0] = b[0];
      r[1] = b[1];
      r[2] = b[2];

      r2[0] = 1;
      r2[1] = 1;
      r2[2] = 1;
    }
}

/*
 * pv = qv + scale * rv, over n nodes of 3 doubles each.
 *
 * The value of time_diff(60) is added to t_calc and that of time_diff(1)
 * to t_dot.
 */
void daxpy(double *pv, double *qv, double *rv, double scale, int n)
{
  int node, c;
  double *dst = pv;
  double *base = qv;
  double *dir = rv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          dst[c] = base[c] + scale * dir[c];
        }
      dst += 3;
      base += 3;
      dir += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
}

/*
 * xv += alpha * p2v + omega * s2v, over n nodes of 3 doubles each.
 *
 * The value of time_diff(60) is added to t_calc and that of time_diff(1)
 * to t_dot.
 */
void daxpy2(double *xv, double *p2v, double *s2v, double alpha, double omega, int n)
{
  int node, c;
  double *x = xv;
  double *p2 = p2v;
  double *s2 = s2v;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          x[c] += alpha * p2[c] + omega * s2[c];
        }
      x += 3;
      p2 += 3;
      s2 += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
}

/*
 * pv = rv + beta * (pv - omega * qv), over n nodes of 3 doubles each.
 * pv is both read and overwritten.
 *
 * The value of time_diff(60) is added to t_calc and that of time_diff(1)
 * to t_dot.
 */
void daxpy3(double *pv, double *rv, double *qv, double beta, double omega, int n)
{
  int node, c;
  double *p = pv;
  double *r = rv;
  double *q = qv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          p[c] = r[c] + beta * (p[c] - omega * q[c]);
        }
      p += 3;
      r += 3;
      q += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
}

/*
 * pv = beta * rv + omega * qv, over n nodes of 3 doubles each.
 * The previous contents of pv are not read.
 *
 * The value of time_diff(60) is added to t_calc and that of time_diff(1)
 * to t_dot.
 */
void daxpy4(double *pv, double *rv, double *qv, double beta, double omega, int n)
{
  int node, c;
  double *p = pv;
  double *r = rv;
  double *q = qv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          p[c] = beta * r[c] + omega * q[c];
        }
      p += 3;
      r += 3;
      q += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
}

/*
 * pv = beta * rv + omega * pv + alpha * qv, over n nodes of 3 doubles each.
 * pv is both read and overwritten.
 *
 * The value of time_diff(60) is added to t_calc and that of time_diff(1)
 * to t_dot.
 */
void daxpy5(double *pv, double *rv, double *qv, double beta, double omega, double alpha, int n)
{
  int node, c;
  double *p = pv;
  double *r = rv;
  double *q = qv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          p[c] = beta * r[c] + omega * p[c] + alpha * q[c];
        }
      p += 3;
      r += 3;
      q += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
}

/*
 * pv = qv, over n nodes of 3 doubles each, copied element by element in
 * ascending order.
 *
 * Only timer 60 is used here: the value of time_diff(60) is added to t_calc.
 */
void copy(double *pv, double *qv, int n)
{
  int node, c;
  double *dst = pv;
  double *src = qv;

  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          dst[c] = src[c];
        }
      dst += 3;
      src += 3;
    }

  t_calc += time_diff(60);
}

/*
 * Inner product of pv and qv.
 *
 * The local part is accumulated over n nodes of 3 doubles each, in storage
 * order, and then passed through DMI_local_barrier_allreduce() with
 * DMI_OP_SUM; the reduced value is returned.
 *
 * NOTE(review): presumably the allreduce sums the partial results of all
 * pnum participants -- confirm against the DMI documentation.
 *
 * Timing: the local loop is accounted to t_calc and t_dot, the allreduce
 * to t_barr.
 */
double dot(double *pv, double *qv, int n, DMI_local_barrier_t *local_barrier, int32_t pnum)
{
  int node, c;
  double reduced;
  double partial = 0;
  double *a = pv;
  double *b = qv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          partial += a[c] * b[c];
        }
      a += 3;
      b += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
  time_lap(1);

  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &partial, &reduced, DMI_OP_SUM, DMI_TYPE_DOUBLE));

  t_barr += time_diff(1);
  return reduced;
}

/*
 * Sum of squares of pv.
 *
 * The local part is accumulated over n nodes of 3 doubles each, in storage
 * order, and then passed through DMI_local_barrier_allreduce() with
 * DMI_OP_SUM.  No square root is taken: the reduced sum of squares itself
 * is returned.
 *
 * NOTE(review): presumably the allreduce sums the partial results of all
 * pnum participants -- confirm against the DMI documentation.
 *
 * Timing: the local loop is accounted to t_calc and t_dot, the allreduce
 * to t_barr.
 */
double norm(double *pv, int n, DMI_local_barrier_t *local_barrier, int32_t pnum)
{
  int node, c;
  double reduced;
  double partial = 0;
  double *v = pv;

  time_lap(1);
  time_lap(60);

  for(node = 0; node < n; node++)
    {
      for(c = 0; c < 3; c++)
        {
          partial += v[c] * v[c];
        }
      v += 3;
    }

  t_calc += time_diff(60);
  t_dot += time_diff(1);
  time_lap(1);

  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &partial, &reduced, DMI_OP_SUM, DMI_TYPE_DOUBLE));

  t_barr += time_diff(1);
  return reduced;
}

/*
 * Reduce the per-process time t three ways (sum, max, min) and let rank 0
 * print a one-line summary.
 *
 *   total         - overall time, printed as "time"
 *   t             - this process's time, the input to all three reductions
 *   local_barrier - barrier object used for the allreduce calls
 *   my_rank       - only rank 0 prints
 *   pnum          - participant count; also the divisor for the average
 *
 * The "comm" field is total minus the reduced maximum of t.  Every caller
 * runs all three allreduce calls, whether or not it prints.
 */
void print_calctime(double total, double t, DMI_local_barrier_t *local_barrier, int my_rank, int pnum)
{
  double sum_t, max_t, min_t;

  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &sum_t, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &max_t, DMI_OP_MAX, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &min_t, DMI_OP_MIN, DMI_TYPE_DOUBLE));

  if(my_rank != 0)
    {
      return;
    }

  outn("pnum=%d time=%.12lf calc_max=%.12lf calc_min=%.12lf calc_avg=%.12lf comm=%.12lf", 
       pnum, total, max_t, min_t, sum_t / pnum, total - max_t);
}
