GraphChi  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Macros
als.hpp
Go to the documentation of this file.
1 
2 
32 #ifndef DEF_ALSHPP
33 #define DEF_ALSHPP
34 
35 #include <assert.h>
36 #include <cmath>
37 #include <errno.h>
38 #include <string>
39 #include <stdint.h>
40 
41 #include "matrixmarket/mmio.h"
42 #include "matrixmarket/mmio.c"
43 
44 #include "api/chifilenames.hpp"
47 
48 // See note above about Eigen
49 #include "Eigen/Dense"
50 #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
51 #include "Eigen/Sparse"
52 #include "Eigen/Cholesky"
53 #include "Eigen/Eigenvalues"
54 #include "Eigen/SVD"
55 using namespace Eigen;
56 
57 
58 typedef MatrixXd mat;
59 typedef VectorXd vec;
60 typedef VectorXi ivec;
61 typedef MatrixXi imat;
62 typedef SparseVector<double> sparse_vec;
63 
64 using namespace graphchi;
65 
66 
67 #ifndef NLATENT
68 #define NLATENT 5 // Dimension of the latent factors. You can specify this in compile time as well (in make).
69 #endif
70 
71 double LAMBDA = 0.065;
72 
74 double rmse=0.0;
75 mutex rmselock;
76 
77 
78 // Hackish: we need to count the number of left
79 // and right vertices in the bipartite graph ourselves.
80 vid_t max_left_vertex =0 ;
81 vid_t max_right_vertex = 0;
82 
83 struct latentvec_t {
84  double d[NLATENT];
85 
86  latentvec_t() {
87  }
88 
89  void init() {
90  for(int k=0; k < NLATENT; k++) d[k] = 0.001 * (std::rand() % 1000);
91  }
92 
93  double & operator[] (int idx) {
94  return d[idx];
95  }
96  bool operator!=(const latentvec_t &oth) const {
97  for(int i=0; i<NLATENT; i++) { if (d[i] != oth.d[i]) return true; }
98  return false;
99  }
100 
101  double dot(latentvec_t &oth) const {
102  double x=0;
103  for(int i=0; i<NLATENT; i++) x+= oth.d[i]*d[i];
104  return x;
105  }
106 
107 };
108 
109 
110 
112  latentvec_t factor;
113  float weight;
114 
116 
117  als_factor_and_weight(float obs) {
118  weight = obs;
119  factor.init();
120  }
121 };
122 
123 
124 
130 template <typename als_edge_type>
131 int convert_matrixmarket_for_ALS(std::string base_filename) {
132  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
133  int ret_code;
134  MM_typecode matcode;
135  FILE *f;
136  int M, N, nz;
137 
141  int nshards;
142  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
143  logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
144  logstream(LOG_INFO) << "If this is not intended, please delete the shard files and try again. " << std::endl;
145  return nshards;
146  }
147 
148  sharder<als_edge_type> sharderobj(base_filename);
149  sharderobj.start_preprocessing();
150 
151 
152  if ((f = fopen(base_filename.c_str(), "r")) == NULL) {
153  logstream(LOG_ERROR) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl;
154  exit(1);
155  }
156 
157 
158  if (mm_read_banner(f, &matcode) != 0)
159  {
160  logstream(LOG_ERROR) << "Could not process Matrix Market banner. File: " << base_filename << std::endl;
161  logstream(LOG_ERROR) << "Matrix must be in the Matrix Market format. " << std::endl;
162  exit(1);
163  }
164 
165 
166  /* This is how one can screen matrix types if their application */
167  /* only supports a subset of the Matrix Market data types. */
168 
169  if (mm_is_complex(matcode) || !mm_is_sparse(matcode))
170  {
171  logstream(LOG_ERROR) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl;
172  logstream(LOG_ERROR) << "Market Market type: " << mm_typecode_to_str(matcode) << std::endl;
173  exit(1);
174  }
175 
176  /* find out size of sparse matrix .... */
177 
178  if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) {
179  logstream(LOG_ERROR) << "Failed reading matrix size: error=" << ret_code << std::endl;
180  exit(1);
181  }
182 
183 
184  logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: "
185  << M << " x " << N << ", non-zeros: " << nz << std::endl;
186 
187  if (M < 5 || N < 5 || nz < 10) {
188  logstream(LOG_ERROR) << "File is suspiciously small. Something wrong? File: " << base_filename << std::endl;
189  assert(M < 5 || N < 5 || nz < 10);
190  }
191 
192 
193  if (!sharderobj.preprocessed_file_exists()) {
194  for (int i=0; i<nz; i++)
195  {
196  int I, J;
197  double val;
198  fscanf(f, "%d %d %lg\n", &I, &J, &val);
199  I--; /* adjust from 1-based to 0-based */
200  J--;
201 
202  sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val));
203  }
204  sharderobj.end_preprocessing();
205 
206  } else {
207  logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl;
208  }
209  if (f !=stdin) fclose(f);
210 
211 
212  logstream(LOG_INFO) << "Now creating shards." << std::endl;
213 
214  // Shard with a specified number of shards, or determine automatically if not defined
215  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));
216 
217  return nshards;
218 }
219 
220 struct MMOutputter : public VCallback<latentvec_t> {
221  FILE * outf;
222  MMOutputter(std::string fname, vid_t nvertices) {
223  MM_typecode matcode;
224  mm_initialize_typecode(&matcode);
225  mm_set_matrix(&matcode);
226  mm_set_array(&matcode);
227  mm_set_real(&matcode);
228 
229  outf = fopen(fname.c_str(), "w");
230  assert(outf != NULL);
231  mm_write_banner(outf, matcode);
232  mm_write_mtx_array_size(outf, NLATENT, nvertices); // Column major
233  }
234 
235  void callback(vid_t vertex_id, latentvec_t &vec) {
236  for(int i=0; i < NLATENT; i++) {
237  fprintf(outf, "%lf\n", vec.d[i]);
238  }
239  }
240 
241  ~MMOutputter() {
242  if (outf != NULL) fclose(outf);
243  }
244 
245 };
246 
247 void output_als_result(std::string filename, vid_t numvertices, vid_t max_left_vertex) {
248  MMOutputter mmoutput_left(filename + "_U.mm", max_left_vertex + 1);
249  foreach_vertices<latentvec_t>(filename, 0, max_left_vertex + 1, mmoutput_left);
250 
251 
252  MMOutputter mmoutput_right(filename + "_V.mm", numvertices - max_left_vertex - 1);
253  foreach_vertices<latentvec_t>(filename, max_left_vertex + 1, numvertices, mmoutput_right);
254  logstream(LOG_INFO) << "ALS output files (in matrix market format): " << filename + "_U.mm" <<
255  ", " << filename + "_V.mm" << std::endl;
256 }
257 
258 
259 
260 #endif