#include "dmi_api.h"

/* Directory prefix used when building the per-partition input file paths. */
#define DIR "/data/1/haraken/graph/"
/* When nonzero, graph_alloc() builds a hard-coded 16-node graph for exactly
   4 ranks instead of reading partition files. */
#define DEBUG0 0
/* When nonzero, graph_alloc() calls halt() and dumps its intermediate arrays. */
#define DEBUG1 0

/* When nonzero, the sizes of the major allocations are printed. */
#define DEBUG_THREADMALLOC 0
/* Weight applied to the neighbour sum in graph_propagate(); the remaining
   (1 - DUMPING_FACTOR) is spread uniformly over total_n.
   NOTE(review): presumably the PageRank "damping" factor; name kept as is. */
#define DUMPING_FACTOR 0.85
/* Subranges whose (right - left) is at most this value are skipped by the
   partitioning phase of fastsort() and finished by its insertion-sort pass. */
#define CONST_FASTSORT_THRESHOLD 12

/*
 * Per-rank view of the graph, built by graph_alloc() and released by
 * graph_free().  The adjacency is stored in a row-offset / column-index
 * layout (rows / cols).
 */
typedef struct graph_t
{
  int64_t total_n;        /* nn * part_num; normaliser used by graph_init()/graph_propagate() */
  int64_t n;              /* number of nodes owned by this rank (sum of nn over its partitions) */
  int64_t nn;             /* node count read from the most recently loaded partition file */
  int64_t extn;           /* n plus the number of distinct non-owned columns referenced */
  int64_t nplu;           /* number of column entries (length of cols and ds) */
  int64_t begin_node_id;  /* begin id taken from the first partition of this rank */
  int64_t end_node_id;    /* end id (exclusive) taken from the last partition of this rank */
  int64_t *inners;        /* n global ids of the owned nodes */
  int64_t *cols;          /* nplu local column indices in [0, extn) */
  int64_t *rows;          /* n + 1 offsets into cols */
  int64_t *outdegs;       /* n per-node counts; values are stored divided by these when > 0 */
  double *ds;             /* nplu doubles read from the partition files */
  double *values;         /* n current values of the owned nodes */
  double *ext_values;     /* extn values: owned nodes first, then the non-owned ones read via local_rwset */
  DMI_local_rwset_t local_rwset; /* handle used to write values and read non-owned values */
}graph_t;

/*
 * Pair used by graph_alloc() to sort non-owned column references by their
 * global id while remembering where each one sits in the column array.
 */
typedef struct colindex_t
{
  int64_t global_col; /* global node id of the referenced column (sort key of fastsort()) */
  int64_t index;      /* position of this reference in the global_cols / cols arrays */
}colindex_t;

/*
 * Parameter record that DMI_main() writes with DMI_write() and every
 * DMI_scaleunit() instance reads back with DMI_read().
 */
typedef struct scaleunit_t
{
  int32_t niter;            /* number of graph_propagate() iterations to run */
  int32_t part_num;         /* number of partition files of the dataset */
  int64_t barrier_addr;     /* address of the shared DMI barrier */
  int64_t rwset_addr;       /* address of the shared DMI rwset */
  int64_t outdegs_addr;     /* address of the shared out-degree array (int64_t per node) */
  char dataset[FNAME_SIZE]; /* NUL-terminated dataset name used to build file paths */
}scaleunit_t;

/* Per-thread running total (bytes) of allocations that stay attached to the graph. */
__thread int64_t _used_memory = 0;
/* Per-thread running total (bytes) of temporary allocations made while building the graph. */
__thread int64_t _tmp_memory = 0;

/* Per-thread accumulated time of the compute loop in graph_propagate(). */
__thread double t_calc = 0;

/* Forward declarations of the functions defined later in this file. */
graph_t* graph_alloc(char *dataset, int32_t part_num, int64_t rwset_addr, int64_t outdegs_addr, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum);
void graph_free(graph_t *graph);
void graph_init(graph_t *graph, int32_t my_rank, int32_t pnum);
void graph_dummy_propagate(graph_t *graph, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum);
void graph_propagate(graph_t *graph, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum);
void graph_allreduce(graph_t *graph, double *sum_ptr, double *max_ptr, double *min_ptr, int8_t flag, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum);
int32_t calc_dst_rank(int32_t part, int32_t part_num, int32_t pnum);
void fastsort(int64_t n, colindex_t *colindices);
void set_array(int64_t *array, int64_t num, ...);
void print_array(int64_t *array, int64_t num, char *str, int32_t my_rank, int32_t pnum);
void print_time(double t, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum, char *str);
void print_calctime(double total, double t, DMI_local_barrier_t *local_barrier, int my_rank, int pnum);

/*
 * Driver: parses the command line, allocates the shared DMI objects, publishes
 * the run parameters in a scaleunit_t and hands control to DMI_rescale().
 *
 * argv: init_node_num thread_num dataset part_num niter
 *
 * The per-partition node count nn is read from the header of partition 0
 * (three doubles followed by an int64_t) to size the shared out-degree array.
 * NOTE(review): DMI_rescale() presumably runs DMI_scaleunit() on the requested
 * nodes/threads and returns when they finish -- confirm against the DMI API.
 */
void DMI_main(int argc, char **argv)
{
  FILE *fp;
  scaleunit_t scaleunit;
  char file[FNAME_SIZE];
  char *dataset;
  int32_t part_num, ret, niter, init_node_num, thread_num;
  int64_t nn; /* must be 64-bit: the file stores it as int64_t */
  int64_t scaleunit_addr, barrier_addr, rwset_addr, outdegs_addr;
  double dummy;
  
  if(argc != 6)
    {
      outn("usage: %s init_node_num thread_num dataset part_num niter", argv[0]);
      error();
    }
  init_node_num = atoi(argv[1]);
  thread_num = atoi(argv[2]);
  dataset = argv[3];
  part_num = atoi(argv[4]);
  niter = atoi(argv[5]);
  if(part_num < init_node_num * thread_num)
    {
      outn("part_num < init_node_num * thread_num");
      error();
    }
  /* The name is copied into a fixed-size field below; reject names that do not fit. */
  if(strlen(dataset) >= sizeof(scaleunit.dataset))
    {
      outn("dataset name is too long");
      error();
    }
  
  catch(DMI_mmap(&scaleunit_addr, sizeof(scaleunit_t), 1, NULL));
  catch(DMI_mmap(&rwset_addr, sizeof(DMI_rwset_t), 1, NULL));
  catch(DMI_mmap(&barrier_addr, sizeof(DMI_barrier_t), 1, NULL));
  
  /* Read nn from the header of partition 0; a truncated path is treated as an error. */
  ret = snprintf(file, sizeof(file), "%s/%s_0_%d.dat", DIR, dataset, part_num);
  if(ret < 0 || (size_t)ret >= sizeof(file)) error();
  fp = fopen(file, "rb");
  if(fp == NULL) error();
  ret = fread(&dummy, sizeof(double), 1, fp);
  if(ret != 1) error();
  ret = fread(&dummy, sizeof(double), 1, fp);
  if(ret != 1) error();
  ret = fread(&dummy, sizeof(double), 1, fp);
  if(ret != 1) error();
  ret = fread(&nn, sizeof(int64_t), 1, fp);
  if(ret != 1) error();
  fclose(fp);
  
  /* One page of nn out-degree counters per partition. */
  catch(DMI_mmap(&outdegs_addr, sizeof(int64_t) * nn, part_num, NULL));
#if DEBUG_THREADMALLOC
  outn("DMI_mmap: outdegs_addr=%lf MB", sizeof(int64_t) * nn * part_num / 1024.0 / 1024.0);
#endif
  catch(DMI_barrier_init(barrier_addr));
  
  scaleunit.niter = niter;
  scaleunit.barrier_addr = barrier_addr;
  scaleunit.part_num = part_num;
  scaleunit.rwset_addr = rwset_addr;
  scaleunit.outdegs_addr = outdegs_addr;
  strcpy(scaleunit.dataset, dataset); /* length checked above */
  catch(DMI_write(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_EXCLUSIVE, NULL));
  
  DMI_rescale(scaleunit_addr, init_node_num, thread_num);
  
  catch(DMI_barrier_destroy(barrier_addr));
  catch(DMI_munmap(outdegs_addr, NULL));
  catch(DMI_munmap(rwset_addr, NULL));
  catch(DMI_munmap(barrier_addr, NULL));
  catch(DMI_munmap(scaleunit_addr, NULL));
  return;
}

int32_t DMI_scaleunit(int my_rank, int pnum, int64_t scaleunit_addr)
{
  FILE *fp;
  scaleunit_t scaleunit;
  DMI_local_barrier_t local_barrier;
  graph_t *graph;
  char dataset[FNAME_SIZE], file[FNAME_SIZE];
  int32_t part_num, ret, iter;
  int64_t barrier_addr, rwset_addr, outdegs_addr, nn;
  double sum, max, min, dummy;
  
  catch(DMI_read(scaleunit_addr, sizeof(scaleunit_t), &scaleunit, DMI_GET, NULL));
  bind_to_cpu(my_rank % PROCNUM);
  
  part_num = scaleunit.part_num;
  barrier_addr = scaleunit.barrier_addr;
  rwset_addr = scaleunit.rwset_addr;
  outdegs_addr = scaleunit.outdegs_addr;
  strcpy(dataset, scaleunit.dataset);
  
  catch(DMI_local_barrier_init(&local_barrier, barrier_addr));
  
  if(my_rank == 0)
    {
      sprintf(file, "%s/%s_0_%d.dat", DIR, dataset, part_num);
      fp = fopen(file, "rb");
      if(fp == NULL) error();
      ret = fread(&dummy, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&dummy, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&dummy, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&nn, sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      fclose(fp);
      
      catch(DMI_rwset_init(rwset_addr, nn * part_num, sizeof(double), pnum));
    }
  
  catch(DMI_local_barrier_sync(&local_barrier, pnum));
  
  graph = graph_alloc(dataset, part_num, rwset_addr, outdegs_addr, &local_barrier, my_rank, pnum);
  catch(DMI_local_barrier_sync(&local_barrier, pnum));
  graph_init(graph, my_rank, pnum);
  if(my_rank == 0)
    {
      outn("_tmp_memory=%lf MB", _tmp_memory / 1024.0 / 1024.0);
      outn("_used_memory=%lf MB", _used_memory / 1024.0 / 1024.0);
    }
  catch(DMI_local_barrier_sync(&local_barrier, pnum));
  
  graph_dummy_propagate(graph, &local_barrier, my_rank, pnum);
  graph_dummy_propagate(graph, &local_barrier, my_rank, pnum);
  graph_dummy_propagate(graph, &local_barrier, my_rank, pnum);
  
  catch(DMI_local_barrier_sync(&local_barrier, pnum));
  
  time_lap(12);
  for(iter = 0; iter < scaleunit.niter; iter++)
    {
      time_lap(10);
      graph_propagate(graph, &local_barrier, my_rank, pnum);
      
      graph_allreduce(graph, &sum, &max, &min, FALSE, &local_barrier, my_rank, pnum);
      if(my_rank == 0)
        {
          outn("iteration=%d time=%.12lf sum=%.12lf max=%e min=%e", iter, time_diff(10), sum, max, min);
        }
      //halt(1);
    }
  
  time_lap(14);
  graph_allreduce(graph, &sum, &max, &min, TRUE, &local_barrier, my_rank, pnum);
  if(my_rank == 0)
    {
      outn("niter=%d sum=%.12lf max=%e min=%e", iter, sum, max, min);
    }
#if 0
  {
    int rank;
    for(rank = 0; rank < pnum; rank++)
      {
        catch(DMI_local_barrier_sync(&local_barrier, pnum));
        if(rank == my_rank)
          {
            outn("my_rank=%d n=%d extn=%d nplu=%d time=%.12lf", my_rank, graph->n, graph->extn, graph->nplu, t_calc / iter);
          }
      }
  }
#endif
  print_calctime((time_ref(14) - time_ref(12)) / scaleunit.niter, t_calc / scaleunit.niter, &local_barrier, my_rank, pnum);
  
  graph_free(graph);
  
  if(my_rank == 0)
    {
      catch(DMI_rwset_destroy(rwset_addr));
    }
  catch(DMI_local_barrier_destroy(&local_barrier));
  return 0;
}

/*
 * Builds this rank's part of the graph and registers it with the shared DMI
 * objects.  Must be called by all pnum ranks because it synchronises on
 * local_barrier several times.
 *
 * Steps:
 *  1. Split the part_num partition files over the ranks (the first
 *     part_num % pnum ranks get one extra) and load this rank's files.
 *  2. Translate global column ids to local indices: owned nodes map to
 *     [0, n), every distinct non-owned column gets the next index from n on.
 *  3. Count how often each column is referenced and accumulate the counts in
 *     the shared out-degree array at outdegs_addr.
 *  4. Register the owned nodes as write elements and the non-owned columns as
 *     read elements of the rwset at rwset_addr.
 *
 * Returns a graph_t allocated with DMI_thread_malloc(); release it with
 * graph_free().
 */
graph_t* graph_alloc(char *dataset, int32_t part_num, int64_t rwset_addr, int64_t outdegs_addr, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum)
{
  DMI_local_group_t group;
  graph_t *graph;
  FILE *fp;
  char file[FNAME_SIZE];
  int32_t ret, rank, part;
  int32_t quota[pnum], cum_quota[pnum + 1];
  int64_t node_id, begin_node_id, end_node_id, i, extn_sup, nplu, 
    edgecut, cur_global_col, read_element_num, write_element_num, group_num;
  int64_t *global_cols, *index_to_global_cols, *my_outdegs, *rows, 
    *ptr_offsets, *sizes, *addrs, *read_elements, *write_elements, *tmp_outdegs;
  colindex_t *sorted_colindices;
  double mu, sigma, rho;
  double extn_sup_local, edgecut_total;
  
  graph = DMI_thread_malloc(sizeof(graph_t));
  _used_memory += sizeof(graph_t);
  
  /* quota[r] = number of partitions of rank r; cum_quota = its prefix sums. */
  for(rank = 0; rank < pnum; rank++)
    {
      quota[rank] = part_num / pnum;
    }
  for(rank = 0; rank < part_num % pnum; rank++)
    {
      quota[rank]++;
    }
  cum_quota[0] = 0;
  for(rank = 0; rank < pnum; rank++)
    {
      cum_quota[rank + 1] = cum_quota[rank] + quota[rank];
    }
  
  /* Defined values even if this rank ends up reading no partition file. */
  mu = 0;
  sigma = 0;
  rho = 0;
  
  if(my_rank == 0)
    outn("=== file reading ===");
#if DEBUG0
  if(pnum != 4)
    {
      outn("pnum != 4");
      error();
    }
  graph->n = 4;
  graph->nn = 4;
  graph->total_n = 16;
  graph->nplu = 12;
  graph->begin_node_id = graph->n * my_rank;
  graph->end_node_id = graph->n * (my_rank + 1);
  graph->inners = DMI_thread_malloc(graph->n * sizeof(int64_t));
  _used_memory += graph->n * sizeof(int64_t);
  graph->rows = DMI_thread_malloc((graph->n + 1) * sizeof(int64_t));
  _used_memory += (graph->n + 1) * sizeof(int64_t);
  global_cols = my_malloc(graph->nplu * sizeof(int64_t));
  _used_memory += graph->nplu * sizeof(int64_t);
  graph->cols = DMI_thread_malloc(graph->nplu * sizeof(int64_t));
  _used_memory += graph->nplu * sizeof(int64_t);
  graph->ds = DMI_thread_malloc(graph->nplu * sizeof(double));
  _used_memory += graph->nplu * sizeof(double);
  {
    /* Fixture tables are typed int64_t so nothing relies on passing plain
       int literals through a variadic call that reads int64_t. */
    static const int64_t debug_inners[4][4] = {
      {0, 1, 2, 3},
      {4, 5, 6, 7},
      {8, 9, 10, 11},
      {12, 13, 14, 15}
    };
    static const int64_t debug_rows[4][5] = {
      {0, 2, 5, 8, 12},
      {0, 3, 5, 9, 12},
      {0, 3, 7, 9, 12},
      {0, 4, 7, 10, 12}
    };
    static const int64_t debug_cols[4][12] = {
      {1, 2, 0, 3, 4, 0, 3, 8, 1, 2, 9, 6},
      {1, 6, 5, 4, 7, 4, 3, 12, 7, 5, 6, 13},
      {2, 9, 10, 3, 8, 11, 12, 8, 11, 10, 9, 14},
      {6, 9, 14, 13, 7, 12, 15, 11, 12, 15, 14, 13}
    };
    /* Ranks 0..2 use their own table; every other rank uses the last one. */
    int32_t debug_rank = (my_rank >= 0 && my_rank < 3) ? my_rank : 3;
    
    memcpy(graph->inners, debug_inners[debug_rank], sizeof(debug_inners[0]));
    memcpy(graph->rows, debug_rows[debug_rank], sizeof(debug_rows[0]));
    memcpy(global_cols, debug_cols[debug_rank], sizeof(debug_cols[0]));
  }
#else
  graph->n = 0;
  graph->nn = 0;
  graph->nplu = 0;
  graph->begin_node_id = -1;
  graph->end_node_id = -1;
  graph->inners = NULL;
  graph->rows = DMI_thread_malloc(sizeof(int64_t));
  graph->rows[0] = 0;
  global_cols = NULL;
  graph->cols = NULL;
  graph->ds = NULL;
  for(part = cum_quota[my_rank]; part < cum_quota[my_rank + 1]; part++)
    {
      /* A truncated path is an error rather than a buffer overflow. */
      ret = snprintf(file, sizeof(file), "%s/%s_%d_%d.dat", DIR, dataset, part, part_num);
      if(ret < 0 || (size_t)ret >= sizeof(file)) error();
      fp = fopen(file, "rb");
      if(fp == NULL) error();
      /* Header: mu, sigma, rho (double), nn, nplu, begin id, end id (int64_t). */
      ret = fread(&mu, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&sigma, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&rho, sizeof(double), 1, fp);
      if(ret != 1) error();
      ret = fread(&graph->nn, sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      ret = fread(&nplu, sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      ret = fread(&begin_node_id, sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      ret = fread(&end_node_id, sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      if(graph->nn != end_node_id - begin_node_id) error();
      graph->n += graph->nn;
      graph->nplu += nplu;
      if(part == cum_quota[my_rank])
        {
          graph->begin_node_id = begin_node_id;
        }
      if(part == cum_quota[my_rank + 1] - 1)
        {
          graph->end_node_id = end_node_id;
        }
      /* Append the ids of this partition's nodes. */
      graph->inners = DMI_thread_realloc(graph->inners, graph->n * sizeof(int64_t));
      node_id = begin_node_id;
      for(i = graph->n - graph->nn; i < graph->n; i++)
        {
          graph->inners[i] = node_id++;
        }
      if(node_id != end_node_id) error();
      /* Append the row offsets, rebased onto the offsets accumulated so far:
         first copy the previous end offset into every new slot, then add the
         partition-relative offsets read from the file. */
      graph->rows = DMI_thread_realloc(graph->rows, (graph->n + 1) * sizeof(int64_t));
      rows = my_malloc((graph->nn + 1) * sizeof(int64_t));
      ret = fread(rows, (graph->nn + 1) * sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      for(i = graph->n - graph->nn; i < graph->n; i++)
        {
          graph->rows[i + 1] = graph->rows[i];
        }
      for(i = graph->n - graph->nn; i < graph->n + 1; i++)
        {
          graph->rows[i] += rows[i - graph->n + graph->nn];
        }
      my_free(rows);
      /* Append the global column ids and the per-entry doubles. */
      global_cols = my_realloc(global_cols, graph->nplu * sizeof(int64_t));
      graph->cols = DMI_thread_realloc(graph->cols, graph->nplu * sizeof(int64_t));
      ret = fread(global_cols + graph->nplu - nplu, nplu * sizeof(int64_t), 1, fp);
      if(ret != 1) error();
      graph->ds = DMI_thread_realloc(graph->ds, graph->nplu * sizeof(double));
      ret = fread(graph->ds + graph->nplu - nplu, nplu * sizeof(double), 1, fp);
      if(ret != 1) error();
      fclose(fp);
    }
  /* Uses the nn of the last file read, i.e. assumes every partition has the same nn. */
  graph->total_n = graph->nn * part_num;
  _tmp_memory += graph->nplu * sizeof(int64_t);
  _used_memory += graph->n * sizeof(int64_t);
  _used_memory += (graph->n + 1) * sizeof(int64_t);
  _used_memory += graph->nplu * sizeof(int64_t);
  _used_memory += graph->nplu * sizeof(double);
#if DEBUG_THREADMALLOC
  outn("thread_malloc: graph->inners=%lf MB", graph->n * sizeof(int64_t) / 1024.0 / 1024.0);
  outn("thread_malloc: graph->rows=%lf MB", (graph->n + 1) * sizeof(int64_t) / 1024.0 / 1024.0);
  outn("thread_malloc: graph->cols=%lf MB", graph->nplu * sizeof(int64_t) / 1024.0 / 1024.0);
  outn("thread_malloc: graph->ds=%lf MB", graph->nplu * sizeof(double) / 1024.0 / 1024.0);
#endif  
#endif

#if DEBUG1
  halt(my_rank);
#endif
#if DEBUG1
  outn("%d nn=%lld n=%lld nplu=%lld", my_rank, (long long)graph->nn, (long long)graph->n, (long long)graph->nplu);
  print_array(graph->inners, graph->n, "inners", my_rank, pnum);
  print_array(graph->rows, graph->n + 1, "rows", my_rank, pnum);
  print_array(global_cols, graph->nplu, "global_cols", my_rank, pnum);
#endif
  
  if(my_rank == 0)
    outn("=== translating global indexes to local indexes ===");
  /* First pass: count the references to columns outside [begin, end). */
  extn_sup = 0;
  for(i = 0; i < graph->nplu; i++)
    {
      if(!(graph->begin_node_id <= global_cols[i] && global_cols[i] < graph->end_node_id))
        {
          extn_sup++;
        }
    }
  sorted_colindices = my_malloc(extn_sup * sizeof(colindex_t));
  _tmp_memory += extn_sup * sizeof(colindex_t);
  /* Second pass: record (global id, position) of each such reference. */
  extn_sup = 0;
  for(i = 0; i < graph->nplu; i++)
    {
      if(!(graph->begin_node_id <= global_cols[i] && global_cols[i] < graph->end_node_id))
        {
          sorted_colindices[extn_sup].global_col = global_cols[i];
          sorted_colindices[extn_sup].index = i;
          extn_sup++;
        }
    }
  
  /* Sorting by global id makes duplicates adjacent. */
  fastsort(extn_sup, sorted_colindices);
  
  index_to_global_cols = my_malloc((graph->n + extn_sup) * sizeof(int64_t));
  _tmp_memory += (graph->n + extn_sup) * sizeof(int64_t);
  /* Owned nodes: local index i is global id begin_node_id + i. */
  for(i = 0; i < graph->n; i++)
    {
      index_to_global_cols[i] = i + graph->begin_node_id;
    }
  
  graph->outdegs = DMI_thread_malloc(graph->n * sizeof(int64_t));
  _used_memory += graph->n * sizeof(int64_t);
#if DEBUG_THREADMALLOC
  outn("thread_malloc: graph->outdegs=%lf MB", graph->n * sizeof(int64_t) / 1024.0 / 1024.0);
#endif  
  for(i = 0; i < graph->n; i++)
    {
      graph->outdegs[i] = 0;
    }
  ptr_offsets = my_malloc(extn_sup * sizeof(int64_t));
  _tmp_memory += extn_sup * sizeof(int64_t);
  addrs = my_malloc(extn_sup * sizeof(int64_t));
  _tmp_memory += extn_sup * sizeof(int64_t);
  sizes = my_malloc(extn_sup * sizeof(int64_t));
  _tmp_memory += extn_sup * sizeof(int64_t);
  my_outdegs = my_malloc(extn_sup * sizeof(int64_t));
  _tmp_memory += extn_sup * sizeof(int64_t);
  
  /* Walk the sorted references: each new global id gets the next local index
     (starting at n) and one slot in the DMI group that addresses its counter
     in the shared out-degree array; repeats reuse the index and bump the
     number of references this rank contributes. */
  cur_global_col = -1;
  graph->extn = graph->n;
  group_num = 0;
  for(i = 0; i < extn_sup; i++)
    {
      if(sorted_colindices[i].global_col != cur_global_col)
        {
          index_to_global_cols[graph->extn] = sorted_colindices[i].global_col;
          graph->cols[sorted_colindices[i].index] = graph->extn;
          ptr_offsets[group_num] = group_num * sizeof(int64_t);
          addrs[group_num] = outdegs_addr + sorted_colindices[i].global_col * sizeof(int64_t);
          sizes[group_num] = sizeof(int64_t);
          my_outdegs[group_num] = 1;
          group_num++;
          graph->extn++;
        }
      else
        {
          graph->cols[sorted_colindices[i].index] = graph->extn - 1;
          my_outdegs[group_num - 1]++;
        }
      cur_global_col = sorted_colindices[i].global_col;
    }
  /* References to owned columns: local index is the offset from begin_node_id. */
  for(i = 0; i < graph->nplu; i++)
    {
      if(graph->begin_node_id <= global_cols[i] && global_cols[i] < graph->end_node_id)
        {
          graph->cols[i] = global_cols[i] - graph->begin_node_id;
          graph->outdegs[global_cols[i] - graph->begin_node_id]++;
        }
    }
  /* Publish the locally counted references of the owned nodes. */
  catch(DMI_write(outdegs_addr + graph->begin_node_id * sizeof(int64_t), graph->n * sizeof(int64_t), graph->outdegs, DMI_EXCLUSIVE, NULL));
  
#if DEBUG1
  outn("%d extn=%lld", my_rank, (long long)graph->extn);
  print_array(graph->outdegs, graph->n, "graph->outdegs", my_rank, pnum);
  print_array(index_to_global_cols, graph->extn, "index_to_global_cols", my_rank, pnum);
  print_array(graph->cols, graph->nplu, "cols", my_rank, pnum);
#endif
  
  my_free(global_cols);
  my_free(sorted_colindices);
  
  /* NOTE(review): the original author marked this barrier with "bug???"; kept unchanged. */
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  catch(DMI_group_init(&group, addrs, ptr_offsets, sizes, group_num));
  
  my_free(sizes);
  my_free(addrs);
  my_free(ptr_offsets);
  
  tmp_outdegs = my_malloc(extn_sup * sizeof(int64_t));
  _tmp_memory += extn_sup * sizeof(int64_t);
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  /* One rank per barrier round adds its contributions to the counters of the
     non-owned columns (read, add, write back); presumably serialised so that
     the read-modify-write sequences of different ranks do not interleave. */
  for(rank = 0; rank < pnum; rank++)
    {
      if(rank == my_rank)
        {
          catch(DMI_group_read(&group, tmp_outdegs, DMI_GET, NULL));
          for(i = 0; i < group_num; i++)
            {
              tmp_outdegs[i] += my_outdegs[i];
            }
#if DEBUG1
          print_array(tmp_outdegs, group_num, "tmp_outdegs", my_rank, pnum);
#endif
          catch(DMI_group_write(&group, tmp_outdegs, DMI_PUT, NULL));
        }
      catch(DMI_local_barrier_sync(local_barrier, pnum));
    }
  
  catch(DMI_group_destroy(&group));
  
  /* Fetch the final counters of the owned nodes, now including other ranks' references. */
  catch(DMI_read(outdegs_addr + graph->begin_node_id * sizeof(int64_t), graph->n * sizeof(int64_t), graph->outdegs, DMI_GET, NULL));
#if DEBUG1
  halt(my_rank);
#endif
#if DEBUG1
  print_array(graph->outdegs, graph->n, "graph->outdegs", my_rank, pnum);
#endif
  
  my_free(tmp_outdegs);
  my_free(my_outdegs);
  
  /* The reduction is declared as DMI_TYPE_DOUBLE, so give it genuine doubles
     and convert the result back, instead of passing int64_t storage. */
  extn_sup_local = (double)extn_sup;
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &extn_sup_local, &edgecut_total, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  edgecut = (int64_t)edgecut_total;
  if(my_rank == 0)
    {
      outn("nn=%lld total_n=%lld edgecut=%lld mu=%lf sigma=%lf rho=%lf", 
           (long long)graph->nn, (long long)graph->total_n, (long long)edgecut, mu, sigma, rho);
      for(rank = 0; rank < pnum; rank++)
        {
          out("%d ", quota[rank]);
        }
      outn("");
    }
  
#if DEBUG1
  halt(my_rank);
#endif
  /* Write elements: the owned nodes.  Read elements: the non-owned columns,
     in the order of their local indices n .. extn-1. */
  write_element_num = graph->n;
  read_element_num = graph->extn - graph->n;
  write_elements = my_malloc(sizeof(int64_t) * write_element_num);
  _tmp_memory += sizeof(int64_t) * write_element_num;
  read_elements = my_malloc(sizeof(int64_t) * read_element_num);
  _tmp_memory += sizeof(int64_t) * read_element_num;
  for(i = 0; i < write_element_num; i++)
    {
      write_elements[i] = graph->inners[i];
    }
  for(i = 0; i < read_element_num; i++)
    {
      read_elements[i] = index_to_global_cols[i + graph->n];
    }
#if DEBUG1
  print_array(write_elements, write_element_num, "write_elements", my_rank, pnum);
  print_array(read_elements, read_element_num, "read_elements", my_rank, pnum);
#endif
  
  my_free(index_to_global_cols);
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  catch(DMI_rwset_decompose(rwset_addr, my_rank, write_elements, write_element_num));
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  catch(DMI_local_rwset_init(&graph->local_rwset, rwset_addr, my_rank, read_elements, read_element_num));
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  
  my_free(read_elements);
  my_free(write_elements);
  
  graph->values = DMI_thread_malloc(graph->n * sizeof(double));
  _used_memory += graph->n * sizeof(double);
  graph->ext_values = DMI_thread_malloc(graph->extn * sizeof(double));
  _used_memory += graph->extn * sizeof(double);
#if DEBUG_THREADMALLOC
  outn("thread_malloc: graph->values=%lf MB", graph->n * sizeof(double) / 1024.0 / 1024.0);
  outn("thread_malloc: graph->ext_values=%lf MB", graph->extn * sizeof(double) / 1024.0 / 1024.0);
#endif
  if(my_rank == 0)
    outn("=== graph created ===");
  return graph;
}

/*
 * Releases everything graph_alloc() attached to the graph: the local rwset
 * handle first, then each array, and finally the graph_t itself.
 */
void graph_free(graph_t *graph)
{
  /* Buffers in the order they are handed back to DMI_thread_free(). */
  void *owned[8];
  int32_t k;
  
  catch(DMI_local_rwset_destroy(&graph->local_rwset));
  
  owned[0] = graph->ext_values;
  owned[1] = graph->values;
  owned[2] = graph->outdegs;
  owned[3] = graph->ds;
  owned[4] = graph->rows;
  owned[5] = graph->cols;
  owned[6] = graph->inners;
  owned[7] = graph;
  for(k = 0; k < 8; k++)
    {
      DMI_thread_free(owned[k]);
    }
}

/*
 * Gives every owned node the initial value 1 / total_n, stored divided by
 * the node's out-degree when that is positive, and publishes the values
 * through the local rwset.  my_rank and pnum are not used.
 */
void graph_init(graph_t *graph, int32_t my_rank, int32_t pnum)
{
  int64_t node;
  double share;
  
  for(node = 0; node < graph->n; node++)
    {
      share = 1.0 / graph->total_n;
      if(graph->outdegs[node] > 0)
        {
          share /= graph->outdegs[node];
        }
      graph->values[node] = share;
    }
  catch(DMI_local_rwset_write(&graph->local_rwset, graph->values, NULL));
}

/*
 * One measured iteration: publish the owned values, fetch the non-owned ones,
 * then recompute every owned value from the values its row refers to.
 *
 * For each owned node the neighbour sum starts from 0 when the node has a
 * positive out-degree and from the node's own current value otherwise; the
 * new value is DUMPING_FACTOR * sum + (1 - DUMPING_FACTOR) / total_n, stored
 * divided by the out-degree when that is positive.
 *
 * Timers 20-24 and 30 capture the individual phases (only reported when the
 * disabled block at the end is enabled); timer 60 feeds t_calc.
 */
void graph_propagate(graph_t *graph, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum)
{
  int64_t node, edge, outdeg;
  double acc, updated;
  double t0, t1, t2, t3, t4, t5;
  
  /* Phase 0: wait for everybody. */
  time_lap(30);
  time_lap(20);
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  t0 = time_diff(20);
  
  /* Phase 1: publish the owned values. */
  time_lap(21);
  catch(DMI_local_rwset_write(&graph->local_rwset, graph->values, NULL));
  t1 = time_diff(21);
  
  /* Phase 2: owned values form the head of ext_values; wait for all writes. */
  time_lap(22);
  memcpy(graph->ext_values, graph->values, graph->n * sizeof(double));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  t2 = time_diff(22);
  
  /* Phase 3: fetch the non-owned values into the tail of ext_values. */
  time_lap(23);
  catch(DMI_local_rwset_read(&graph->local_rwset, graph->ext_values + graph->n, NULL));
  t3 = time_diff(23);
  
  /* Phase 4: local computation. */
  time_lap(24);
  time_lap(60);
  for(node = 0; node < graph->n; node++)
    {
      outdeg = graph->outdegs[node];
      acc = (outdeg > 0) ? 0 : graph->values[node];
      for(edge = graph->rows[node]; edge < graph->rows[node + 1]; edge++)
        {
          acc += graph->ext_values[graph->cols[edge]];
        }
      updated = DUMPING_FACTOR * acc + (1 - DUMPING_FACTOR) / graph->total_n;
      if(outdeg > 0)
        {
          updated /= outdeg;
        }
      graph->values[node] = updated;
    }
  t_calc += time_diff(60);
  t4 = time_diff(24);
  t5 = time_diff(30);
#if 0
  print_time(t0, local_barrier, my_rank, pnum, "first_sync_time");
  print_time(t1, local_barrier, my_rank, pnum, "write_time");
  print_time(t2, local_barrier, my_rank, pnum, "second_sync_time");
  print_time(t3, local_barrier, my_rank, pnum, "read_time");
  print_time(t4, local_barrier, my_rank, pnum, "calc_time");
  print_time(t5, local_barrier, my_rank, pnum, "total_time");
#endif
}

/*
 * Untimed round with the same communication pattern as graph_propagate()
 * (barrier, rwset write, barrier, rwset read) that leaves graph->values
 * untouched: it only overwrites the referenced ext_values entries with the
 * sum of the owned values.
 * NOTE(review): presumably a warm-up before the measured loop -- confirm.
 *
 * Calls error() if a column index falls outside [0, extn), the valid index
 * range of ext_values.
 */
void graph_dummy_propagate(graph_t *graph, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum)
{
  int64_t i, j;
  double s;
  
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_rwset_write(&graph->local_rwset, graph->values, NULL));
  memcpy(graph->ext_values, graph->values, graph->n * sizeof(double));
  catch(DMI_local_barrier_sync(local_barrier, pnum));
  catch(DMI_local_rwset_read(&graph->local_rwset, graph->ext_values + graph->n, NULL));
  
  s = 0;
  for(i = 0; i < graph->n; i++)
    {
      s += graph->values[i];
    }
  for(i = 0; i < graph->n; i++)
    {
      for(j = graph->rows[i]; j < graph->rows[i + 1]; j++)
        {
          /* ext_values holds extn elements, so index extn itself is already out of range. */
          if(graph->cols[j] < 0 || graph->cols[j] >= graph->extn) error();
          graph->ext_values[graph->cols[j]] = s;
        }
    }
  return;
}

/*
 * Reduces the owned values over all ranks.  Each stored value is first
 * multiplied by the node's out-degree when that is positive (undoing the
 * division applied when it was stored).
 *
 * sum_ptr always receives the global sum.  max_ptr / min_ptr receive the
 * global maximum / minimum only when flag == TRUE; otherwise both are set to
 * 0 and no further reductions are performed.
 */
void graph_allreduce(graph_t *graph, double *sum_ptr, double *max_ptr, double *min_ptr, int8_t flag, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum)
{
  int64_t node, outdeg;
  double mass;
  double local_sum = 0;
  double local_max = -1e234;
  double local_min = 1e234;
  
  for(node = 0; node < graph->n; node++)
    {
      outdeg = graph->outdegs[node];
      mass = graph->values[node];
      if(outdeg > 0)
        {
          mass *= outdeg;
        }
      local_sum += mass;
      if(local_max < mass)
        {
          local_max = mass;
        }
      if(local_min > mass)
        {
          local_min = mass;
        }
    }
  
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &local_sum, sum_ptr, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  if(flag != TRUE)
    {
      *max_ptr = 0;
      *min_ptr = 0;
      return;
    }
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &local_max, max_ptr, DMI_OP_MAX, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &local_min, min_ptr, DMI_OP_MIN, DMI_TYPE_DOUBLE));
}

/*
 * Returns the rank that owns partition `part` when part_num partitions are
 * dealt out in contiguous runs to pnum ranks, the first part_num % pnum ranks
 * receiving part_num / pnum + 1 partitions and the rest part_num / pnum.
 */
int32_t calc_dst_rank(int32_t part, int32_t part_num, int32_t pnum)
{
  const int32_t extra = part_num % pnum;      /* ranks holding one additional partition */
  const int32_t base = part_num / pnum;       /* partitions held by the remaining ranks */
  const int32_t candidate = part / (base + 1);
  
  if(candidate >= extra)
    {
      /* Past the larger runs: count in runs of `base` from where they end. */
      return (part - extra * (base + 1)) / base + extra;
    }
  return candidate;
}

/*
 * Sorts colindices[0 .. n-1] in place by ascending global_col.
 *
 * Two phases: an iterative partitioning pass that uses explicit stacks of
 * pending subranges and leaves every subrange with
 * right - left <= CONST_FASTSORT_THRESHOLD unsorted, followed by one
 * insertion-sort pass over the whole array that finishes the job.
 */
void fastsort(int64_t n, colindex_t *colindices)
{
  colindex_t pivot, tmp;
  int64_t i, j, left, right, p;
  /* Pending subranges; p is the number of entries in use.
     NOTE(review): capacity is fixed at 32 with no overflow check -- only the
     larger side is ever pushed, which presumably keeps the depth small. */
  int64_t lstack[32], rstack[32];
  
  left = 0;
  right = n - 1;
  p = 0;
  while(1)
    {
      /* Current range is small enough: take the next pending one, or stop. */
      if(right - left <= CONST_FASTSORT_THRESHOLD)
        {
          if(p == 0) break;
          p--;
          left = lstack[p];
          right = rstack[p];
        }
      /* Partition [left, right] around the middle element's key. */
      pivot = colindices[(left + right) / 2];
      i = left;
      j = right;
      while(1)
        {
          while(colindices[i].global_col < pivot.global_col)
            {
              i++;
            }
          while(colindices[j].global_col > pivot.global_col)
            {
              j--;
            }
          if(i >= j) break;
          tmp = colindices[i];
          colindices[i] = colindices[j];
          colindices[j] = tmp;
          i++;
          j--;
        }
      /* Defer the larger side (only if it exceeds the threshold) and carry
         on with the smaller one. */
      if(i - left > right - j)
        {
          if(i - left > CONST_FASTSORT_THRESHOLD)
            {
              lstack[p] = left;
              rstack[p] = i - 1;
              p++;
            }
          left = j + 1;
        }
      else
        {
          if(right - j > CONST_FASTSORT_THRESHOLD)
            {
              lstack[p] = j + 1;
              rstack[p] = right;
              p++;
            }
          right = i - 1;
        }
    }
  
  /* Insertion sort over the whole array; elements equal to the one being
     inserted are not moved past. */
  for(i = 0; i < n; i++)
    {
      tmp = colindices[i];
      for(j = i; j > 0 && colindices[j - 1].global_col > tmp.global_col; j--)
        {
          colindices[j] = colindices[j - 1];
        }
      colindices[j] = tmp;
    }
  return;
}

/*
 * Stores the `num` variadic arguments into array[0 .. num-1].
 * Each variadic argument is read as an int64_t, so callers have to pass
 * values of that type (a plain int literal is not one).
 */
void set_array(int64_t *array, int64_t num, ...)
{
  va_list args;
  int64_t *dst = array;
  int64_t remaining = num;
  
  va_start(args, num);
  while(remaining > 0)
    {
      *dst = va_arg(args, int64_t);
      dst++;
      remaining--;
    }
  va_end(args);
}

/*
 * Prints "<my_rank> <str>: " followed by the num elements of array, separated
 * by spaces, on one line.  pnum is not used.
 */
void print_array(int64_t *array, int64_t num, char *str, int32_t my_rank, int32_t pnum)
{
  int64_t i;
  
  out("%d %s: ", my_rank, str);
  for(i = 0; i < num; i++)
    {
      /* Elements are 64-bit; "%d" would read them as int. */
      out("%lld ", (long long)array[i]);
    }
  outn("");
  return;
}

/*
 * Reduces the per-rank time t over all pnum ranks (sum, max, min) and has
 * rank 0 print the average, maximum and minimum under the label str.
 * All ranks must call this, since every reduction goes through local_barrier.
 */
void print_time(double t, DMI_local_barrier_t *local_barrier, int32_t my_rank, int32_t pnum, char *str)
{
  double total, longest, shortest;
  
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &total, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &longest, DMI_OP_MAX, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &shortest, DMI_OP_MIN, DMI_TYPE_DOUBLE));
  if(my_rank != 0)
    {
      return;
    }
  outn("%s avg=%lf max=%lf min=%lf", str, total / pnum, longest, shortest);
}

/*
 * Reduces the per-rank compute time t over all pnum ranks and has rank 0
 * print it next to the overall time `total`; the "comm" figure is total minus
 * the largest per-rank compute time.
 * All ranks must call this, since every reduction goes through local_barrier.
 */
void print_calctime(double total, double t, DMI_local_barrier_t *local_barrier, int my_rank, int pnum)
{
  double calc_sum, calc_max, calc_min;

  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &calc_sum, DMI_OP_SUM, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &calc_max, DMI_OP_MAX, DMI_TYPE_DOUBLE));
  catch(DMI_local_barrier_allreduce(local_barrier, pnum, &t, &calc_min, DMI_OP_MIN, DMI_TYPE_DOUBLE));
  if(my_rank != 0)
    {
      return;
    }
  outn("pnum=%d time=%.12lf calc_max=%.12lf calc_min=%.12lf calc_avg=%.12lf comm=%.12lf", 
       pnum, total, calc_max, calc_min, calc_sum / pnum, total - calc_max);
}
