39 #include "util/merge.hpp"
41 #include "util/qsort.hpp"
44 #ifndef DEF_GRAPHCHI_LABELANALYSIS
45 #define DEF_GRAPHCHI_LABELANALYSIS
47 using namespace graphchi;
49 template <
typename LabelType>
57 template <
typename LabelType>
59 return a.count > b.count;
62 template <
typename LabelType>
70 std::string filename = filename_vertex_data<LabelType>(base_filename);
71 int f = open(filename.c_str(), O_RDONLY);
74 logstream(
LOG_ERROR) <<
"Could not open file: " << filename <<
75 " error: " << strerror(errno) << std::endl;
78 size_t sz = lseek(f, 0, SEEK_END);
81 size_t bufsize = 1024 * 1024;
82 int nbuf = bufsize /
sizeof(LabelType);
84 std::vector<labelcount_t> curlabels;
88 LabelType * buffer = (LabelType*) calloc(nbuf,
sizeof(LabelType));
91 size_t len = std::min(sz - nread, bufsize);
92 preada(f, buffer, len, nread);
95 int nt = len /
sizeof(LabelType);
98 for(
int i=0; i < nt; i++) {
99 LabelType l = buffer[i];
100 if (l == curvid) buffer[i] = 0xffffffff;
105 quickSort(buffer, nt, std::less<LabelType>());
108 std::vector<labelcount_t> newlabels;
109 newlabels.reserve(nt);
110 vid_t lastlabel = 0xffffffff;
111 for(
int i=0; i < nt; i++) {
112 if (buffer[i] != 0xffffffff) {
113 if (buffer[i] != lastlabel) {
114 newlabels.push_back(labelcount_t(buffer[i], 1));
116 newlabels[newlabels.size() - 1].count ++;
118 lastlabel = buffer[i];
128 for(
int i=0; i < (int)newlabels.size(); i++) {
129 curlabels.push_back(newlabels[i]);
140 std::vector< labelcount_t > merged;
141 merged.reserve(curlabels.size() + newlabels.size());
142 while(cl < (
int)curlabels.size() && nl < (int)newlabels.size()) {
143 if (newlabels[nl].label == curlabels[cl].label) {
144 merged.push_back(labelcount_t(newlabels[nl].label, newlabels[nl].count + curlabels[cl].count));
147 if (newlabels[nl].label < curlabels[cl].label) {
148 merged.push_back(newlabels[nl]);
151 merged.push_back(curlabels[cl]);
156 while(cl < (
int)curlabels.size()) merged.push_back(curlabels[cl++]);
157 while(nl < (
int)newlabels.size()) merged.push_back(newlabels[nl++]);
171 std::sort(curlabels.begin(), curlabels.end(), label_count_greater<LabelType>);
174 std::string outname = base_filename +
"_components.txt";
175 FILE * resf = fopen(outname.c_str(),
"w");
177 logstream(
LOG_ERROR) <<
"Could not write label outputfile : " << outname << std::endl;
180 for(
int i=0; i < (int) curlabels.size(); i++) {
181 fprintf(resf,
"%u,%u\n", curlabels[i].label, curlabels[i].count + 1);
185 std::cout <<
"Total number of different labels (components/communities): " << curlabels.size() << std::endl;
186 std::cout <<
"List of labels was written to file: " << outname << std::endl;
188 for(
int i=0; i < (int)std::min((
size_t)printtop, curlabels.size()); i++) {
189 std::cout << (i+1) <<
". label: " << curlabels[i].label <<
", size: " << curlabels[i].count << std::endl;