GraphChi  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Macros
labelanalysis.hpp
Go to the documentation of this file.
1 
33 #include <vector>
34 #include <algorithm>
35 #include <errno.h>
36 #include <assert.h>
37 
38 #include "logger/logger.hpp"
39 #include "util/merge.hpp"
40 #include "util/ioutil.hpp"
41 #include "util/qsort.hpp"
42 #include "api/chifilenames.hpp"
43 
44 #ifndef DEF_GRAPHCHI_LABELANALYSIS
45 #define DEF_GRAPHCHI_LABELANALYSIS
46 
47 using namespace graphchi;
48 
49 template <typename LabelType>
50 struct labelcount_tt {
51  LabelType label;
52  unsigned int count; // Count excludes the vertex which has its own id as the label. (Important optimization)
53  labelcount_tt(LabelType l, int c) : label(l), count(c) {}
54  labelcount_tt() {}
55 };
56 
57 template <typename LabelType>
58 bool label_count_greater(const labelcount_tt<LabelType> &a, const labelcount_tt<LabelType> &b) {
59  return a.count > b.count;
60 }
61 
62 template <typename LabelType>
63 void analyze_labels(std::string base_filename, int printtop = 20) {
64  typedef labelcount_tt<LabelType> labelcount_t;
70  std::string filename = filename_vertex_data<LabelType>(base_filename);
71  int f = open(filename.c_str(), O_RDONLY);
72 
73  if (f < 0) {
74  logstream(LOG_ERROR) << "Could not open file: " << filename <<
75  " error: " << strerror(errno) << std::endl;
76  return;
77  }
78  size_t sz = lseek(f, 0, SEEK_END);
79 
80  /* Setup buffer sizes */
81  size_t bufsize = 1024 * 1024; // Read one megabyte a time
82  int nbuf = bufsize / sizeof(LabelType);
83 
84  std::vector<labelcount_t> curlabels;
85  size_t nread = 0;
86  bool first = true;
87  vid_t curvid = 0;
88  LabelType * buffer = (LabelType*) calloc(nbuf, sizeof(LabelType));
89 
90  while (nread < sz) {
91  size_t len = std::min(sz - nread, bufsize);
92  preada(f, buffer, len, nread);
93  nread += len;
94 
95  int nt = len / sizeof(LabelType);
96 
97  /* Mark vertices with its own label with 0xffffffff so they will be ignored */
98  for(int i=0; i < nt; i++) {
99  LabelType l = buffer[i];
100  if (l == curvid) buffer[i] = 0xffffffff;
101  curvid++;
102  }
103 
104  /* First sort the buffer */
105  quickSort(buffer, nt, std::less<LabelType>());
106 
107  /* Then collect */
108  std::vector<labelcount_t> newlabels;
109  newlabels.reserve(nt);
110  vid_t lastlabel = 0xffffffff;
111  for(int i=0; i < nt; i++) {
112  if (buffer[i] != 0xffffffff) {
113  if (buffer[i] != lastlabel) {
114  newlabels.push_back(labelcount_t(buffer[i], 1));
115  } else {
116  newlabels[newlabels.size() - 1].count ++;
117  }
118  lastlabel = buffer[i];
119  }
120 
121  /* Check sorted (sanity check) */
122  // for(int i=1; i < newlabels.size(); i++) {
123  // assert(newlabels[i].label > newlabels[i-1].label);
124  // }
125  }
126 
127  if (first) {
128  for(int i=0; i < (int)newlabels.size(); i++) {
129  curlabels.push_back(newlabels[i]);
130  }
131 
132  /* Check sorted (sanity check) */
133  //for(int i=1; i < curlabels.size(); i++) {
134  // assert(curlabels[i].label > curlabels[i-1].label);
135  //}
136  } else {
137  /* Merge current and new label counts */
138  int cl = 0;
139  int nl = 0;
140  std::vector< labelcount_t > merged;
141  merged.reserve(curlabels.size() + newlabels.size());
142  while(cl < (int)curlabels.size() && nl < (int)newlabels.size()) {
143  if (newlabels[nl].label == curlabels[cl].label) {
144  merged.push_back(labelcount_t(newlabels[nl].label, newlabels[nl].count + curlabels[cl].count));
145  nl++; cl++;
146  } else {
147  if (newlabels[nl].label < curlabels[cl].label) {
148  merged.push_back(newlabels[nl]);
149  nl++;
150  } else {
151  merged.push_back(curlabels[cl]);
152  cl++;
153  }
154  }
155  }
156  while(cl < (int)curlabels.size()) merged.push_back(curlabels[cl++]);
157  while(nl < (int)newlabels.size()) merged.push_back(newlabels[nl++]);
158 
159  curlabels = merged;
160 
161  /* Check sorted (sanity check) */
162  // for(int i=1; i < curlabels.size(); i++) {
163  // assert(curlabels[i].label > curlabels[i-1].label);
164  // }
165  }
166 
167  first = false;
168  }
169 
170  /* Sort */
171  std::sort(curlabels.begin(), curlabels.end(), label_count_greater<LabelType>);
172 
173  /* Write output file */
174  std::string outname = base_filename + "_components.txt";
175  FILE * resf = fopen(outname.c_str(), "w");
176  if (resf == NULL) {
177  logstream(LOG_ERROR) << "Could not write label outputfile : " << outname << std::endl;
178  return;
179  }
180  for(int i=0; i < (int) curlabels.size(); i++) {
181  fprintf(resf, "%u,%u\n", curlabels[i].label, curlabels[i].count + 1);
182  }
183  fclose(resf);
184 
185  std::cout << "Total number of different labels (components/communities): " << curlabels.size() << std::endl;
186  std::cout << "List of labels was written to file: " << outname << std::endl;
187 
188  for(int i=0; i < (int)std::min((size_t)printtop, curlabels.size()); i++) {
189  std::cout << (i+1) << ". label: " << curlabels[i].label << ", size: " << curlabels[i].count << std::endl;
190  }
191 }
192 
193 #endif
194 
195