GraphChi  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Macros
toplist.hpp
Go to the documentation of this file.
1 
29 #ifndef DEF_GRAPHCHI_TOPLIST
30 #define DEF_GRAPHCHI_TOPLIST
31 
32 #include <vector>
33 #include <algorithm>
34 #include <errno.h>
35 #include <assert.h>
36 
37 #include "logger/logger.hpp"
38 #include "util/merge.hpp"
39 #include "util/ioutil.hpp"
40 #include "util/qsort.hpp"
41 #include "api/chifilenames.hpp"
42 
43 namespace graphchi {
44 
45  template <typename VertexDataType>
46  struct vertex_value {
47  vid_t vertex;
48  VertexDataType value;
49  vertex_value() {}
50  vertex_value(vid_t v, VertexDataType x) : vertex(v), value(x) {}
51  };
52 
53  template <typename VertexDataType>
54  bool vertex_value_greater(const vertex_value<VertexDataType> &a, const vertex_value<VertexDataType> &b) {
55  return a.value > b.value;
56  }
57 
67  template <typename VertexDataType>
68  std::vector<vertex_value<VertexDataType> > get_top_vertices(std::string basefilename, int ntop) {
69  typedef vertex_value<VertexDataType> vv_t;
70  std::string filename = filename_vertex_data<VertexDataType>(basefilename);
71  int f = open(filename.c_str(), O_RDONLY);
72  if (f < 0) {
73  logstream(LOG_ERROR) << "Could not open file: " << filename <<
74  " error: " << strerror(errno) << std::endl;
75  return std::vector<vertex_value<VertexDataType> >();
76  }
77  size_t sz = lseek(f, 0, SEEK_END);
78 
79  /* Setup buffer sizes */
80  int nverts = (int) (sz / sizeof(VertexDataType));
81  ntop = std::min(ntop, nverts);
82 
83  size_t bufsize = 1024 * 1024; // Read one megabyte a time
84  int nbuf = (int) (bufsize / sizeof(VertexDataType));
85  bufsize = sizeof(VertexDataType) * nbuf; // Ensure that data type size divides bufsize
86  assert(bufsize % sizeof(VertexDataType) == 0);
87 
88  /* Initialize buffer */
89  VertexDataType * buffer = (VertexDataType*) calloc(nbuf, sizeof(VertexDataType));
90  vv_t * buffer_idxs = (vv_t*) calloc(nbuf, sizeof(vv_t));
91  vv_t * topbuf = (vv_t*) calloc(ntop, sizeof(vv_t));
92  vv_t * mergearr = (vv_t*) calloc(ntop * 2, sizeof(vv_t));
93 
94  /* Read the data and maintain the top-list */
95  size_t nread = 0;
96  size_t idx = 0;
97  int count = 0;
98  while (nread < sz) {
99  size_t len = std::min(sz - nread, bufsize);
100  preada(f, buffer, len, nread);
101  nread += len;
102 
103  int nt = (int) (len / sizeof(VertexDataType));
104  int k = 0;
105  VertexDataType minima = VertexDataType();
106  if (count > 0) {
107  minima = topbuf[ntop - 1].value; // Minimum value that should be even considered
108  }
109  for(int j=0; j < nt; j++) {
110  if (count == 0 || (buffer[j] > minima)) {
111  buffer_idxs[k] = vv_t((vid_t)idx, buffer[j]);
112  k++;
113  }
114  idx++;
115  }
116  nt = k; /* How many were actually included */
117 
118  /* Sort buffer-idxs */
119  quickSort(buffer_idxs, nt, vertex_value_greater<VertexDataType>);
120 
121  /* Merge the top with the current top */
122  if (count == 0) {
123  /* Nothing to merge, just copy */
124  memcpy(topbuf, buffer_idxs, ntop * sizeof(vv_t));
125  } else {
126  // void merge(ET* S1, int l1, ET* S2, int l2, ET* R, F f) {
127  merge<vv_t>(topbuf, ntop, buffer_idxs, std::min(ntop, nt), mergearr, vertex_value_greater<VertexDataType>);
128  memcpy(topbuf, mergearr, ntop * sizeof(vv_t));
129  }
130 
131  count++;
132  }
133 
134  /* Return */
135  std::vector< vv_t > ret;
136  for(int i=0; i < ntop; i++) {
137  ret.push_back(topbuf[i]);
138  }
139  free(buffer);
140  free(buffer_idxs);
141  free(mergearr);
142  free(topbuf);
143  close(f);
144  return ret;
145  }
146 
147 
148 };
149 
150 #endif
151