SHOGUN  v3.2.0
HashedDenseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/Parameter.h>
13 #include <shogun/lib/Hash.h>
15 #include <shogun/io/SGIO.h>
17 
18 #include <string.h>
19 
20 namespace shogun {
21 template <class ST>
22 CHashedDenseFeatures<ST>::CHashedDenseFeatures(int32_t size, bool use_quadr, bool keep_lin_terms)
23 : CDotFeatures(size)
24 {
25  init(NULL, 0, use_quadr, keep_lin_terms);
26 }
27 
28 template <class ST>
30  bool use_quadr, bool keep_lin_terms) : CDotFeatures()
31 {
32  init(feats, d, use_quadr, keep_lin_terms);
33 }
34 
35 template <class ST>
37  bool keep_lin_terms) : CDotFeatures()
38 {
39  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(matrix);
40  init(feats, d, use_quadr, keep_lin_terms);
41 }
42 
43 template <class ST>
44 CHashedDenseFeatures<ST>::CHashedDenseFeatures(ST* src, int32_t num_feat, int32_t num_vec,
45  int32_t d, bool use_quadr, bool keep_lin_terms) : CDotFeatures()
46 {
47  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(src, num_feat, num_vec);
48  init(feats, d, use_quadr, keep_lin_terms);
49 }
50 
51 template <class ST>
52 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CFile* loader, int32_t d, bool use_quadr,
53  bool keep_lin_terms) : CDotFeatures(loader)
54 {
56  feats->load(loader);
57  init(feats, d, use_quadr, keep_lin_terms);
58 }
59 
60 template <class ST>
61 void CHashedDenseFeatures<ST>::init(CDenseFeatures<ST>* feats, int32_t d, bool use_quadr,
62  bool keep_lin_terms)
63 {
64  dim = d;
65  dense_feats = feats;
67  use_quadratic = use_quadr;
68  keep_linear_terms = keep_lin_terms;
69 
70  SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
72  SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
74  SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
75  SG_ADD((CSGObject** ) &dense_feats, "dense_feats", "Dense features to work on",
77 
78  set_generic<ST>();
79 }
80 
81 template <class ST>
83 : CDotFeatures(orig)
84 {
85  init(orig.dense_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
86 }
87 
88 template <class ST>
90 {
92 }
93 
94 template <class ST>
96 {
97  return new CHashedDenseFeatures<ST>(*this);
98 }
99 
100 template <class ST>
102 {
103  return dim;
104 }
105 
106 template <class ST>
108  int32_t vec_idx2)
109 {
110  ASSERT(df)
113  ASSERT(strcmp(df->get_name(), get_name())==0)
114 
117 
119 
120  bool same_vec = (df == this) && (vec_idx1 == vec_idx2);
121  SGSparseVector<ST> vec_2 = same_vec ? vec_1 : feats->get_hashed_feature_vector(vec_idx2);
122  float64_t result = vec_1.sparse_dot(vec_2);
123 
124  return result;
125 }
126 
127 template <class ST>
129  int32_t vec2_len)
130 {
131  ASSERT(vec2_len == dim)
132 
133  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
134 
135  float64_t result = 0;
136 
137  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
138  SGVector<uint32_t> hash_cache(hash_cache_size);
139 
140  for (index_t i=0; i<vec.vlen; i++)
141  {
142  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
143  if (use_quadratic)
144  hash_cache[i] = h_idx;
145 
146  if ( (!use_quadratic) || keep_linear_terms)
147  result += vec2[h_idx % dim] * vec[i];
148  }
149 
150  if (use_quadratic)
151  {
152  for (index_t i=0; i<vec.size(); i++)
153  {
154  int32_t n_idx = i * vec.size() + i;
155  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
156  result += vec2[idx] * vec[i] * vec[i];
157 
158  for (index_t j=i+1; j<vec.size(); j++)
159  {
160  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
161  result += vec2[idx] * vec[i] * vec[j];
162  }
163  }
164  }
165 
166  dense_feats->free_feature_vector(vec, vec_idx1);
167  return result;
168 }
169 
170 template <class ST>
172  float64_t* vec2, int32_t vec2_len, bool abs_val)
173 {
174  float64_t val = abs_val ? CMath::abs(alpha) : alpha;
175  ASSERT(vec2_len == dim)
176 
177  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
178 
179  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
180  SGVector<uint32_t> hash_cache(hash_cache_size);
181 
182  for (index_t i=0; i<vec.vlen; i++)
183  {
184  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
185 
186  if (use_quadratic)
187  hash_cache[i] = h_idx;
188 
189  if ( (!use_quadratic) || keep_linear_terms)
190  vec2[h_idx % dim] += val * vec[i];
191  }
192 
193  if (use_quadratic)
194  {
195  for (index_t i=0; i<vec.size(); i++)
196  {
197  int32_t n_idx = i * vec.size() + i;
198  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
199  vec2[idx] += val * vec[i] * vec[i];
200 
201  for (index_t j=i+1; j<vec.size(); j++)
202  {
203  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
204  vec2[idx] += val * vec[i] * vec[j];
205  }
206  }
207  }
208  dense_feats->free_feature_vector(vec, vec_idx1);
209 }
210 
211 template <class ST>
213 {
214  return dim;
215 }
216 
217 template <class ST>
219 {
221  return NULL;
222 }
223 template <class ST>
225  void* iterator)
226 {
228  return false;
229 }
230 template <class ST>
232 {
234 }
235 
236 template <class ST>
238 {
239  return "HashedDenseFeatures";
240 }
241 
242 template <class ST>
244 {
245  return F_UINT;
246 }
247 
248 template <class ST>
250 {
251  return C_SPARSE;
252 }
253 
254 template <class ST>
256 {
257  return dense_feats->get_num_vectors();
258 }
259 
260 template <class ST>
262 {
263  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx);
266  dense_feats->free_feature_vector(vec, vec_idx);
267  return hashed_vec;
268 }
269 
270 template <class ST>
272  bool use_quadratic, bool keep_linear_terms)
273 {
274  SGVector<ST> h_vec(dim);
275  SGVector<ST>::fill_vector(h_vec.vector, dim, 0);
276 
277  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
278  SGVector<uint32_t> hash_cache(hash_cache_size);
279 
280  for (index_t i=0; i<vec.size(); i++)
281  {
282  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
283  if (use_quadratic)
284  hash_cache[i] = hash;
285 
286  if ( (!use_quadratic) || keep_linear_terms)
287  h_vec[hash % dim] += vec[i];
288  }
289 
290  if (use_quadratic)
291  {
292  for (index_t i=0; i<vec.size(); i++)
293  {
294  index_t n_idx = i * vec.size() + i;
295  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
296  h_vec[idx] += vec[i] * vec[i];
297 
298  for (index_t j=i+1; j<vec.size(); j++)
299  {
300  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
301  h_vec[idx] += vec[i] * vec[j];
302  }
303  }
304  }
305 
306  int32_t num_nnz_feats = 0;
307  for(index_t i=0; i<dim; i++)
308  {
309  if (h_vec[i]!=0)
310  num_nnz_feats++;
311  }
312 
313  SGSparseVector<ST> hashed_vector(num_nnz_feats);
314 
315  int32_t sparse_feat_index = 0;
316  for (index_t i=0; i<dim; i++)
317  {
318  if (h_vec[i]!=0)
319  {
320  hashed_vector.features[sparse_feat_index].feat_index = i;
321  hashed_vector.features[sparse_feat_index++].entry = h_vec[i];
322  }
323  }
324 
325  return hashed_vector;
326 }
327 
// Explicit instantiations of CHashedDenseFeatures, one for each element type
// listed below.
328 template class CHashedDenseFeatures<bool>;
329 template class CHashedDenseFeatures<char>;
330 template class CHashedDenseFeatures<int8_t>;
331 template class CHashedDenseFeatures<uint8_t>;
332 template class CHashedDenseFeatures<int16_t>;
333 template class CHashedDenseFeatures<uint16_t>;
334 template class CHashedDenseFeatures<int32_t>;
335 template class CHashedDenseFeatures<uint32_t>;
336 template class CHashedDenseFeatures<int64_t>;
337 template class CHashedDenseFeatures<uint64_t>;
338 template class CHashedDenseFeatures<float32_t>;
339 template class CHashedDenseFeatures<float64_t>;
340 template class CHashedDenseFeatures<floatmax_t>;
341 }
virtual const char * get_name() const =0
CDenseFeatures< ST > * dense_feats
int32_t index_t
Definition: common.h:60
#define SG_UNREF(x)
Definition: SGRefObject.h:35
static SGSparseVector< ST > hash_vector(SGVector< ST > vec, int32_t dim, bool use_quadratic=false, bool keep_linear_terms=true)
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:141
This class is identical to the CDenseFeatures class except that it hashes each dimension to a new fea...
T sparse_dot(const SGSparseVector< T > &v)
virtual CFeatures * duplicate() const
Features that support dot products among other operations.
Definition: DotFeatures.h:41
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:35
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:365
virtual const char * get_name() const
#define ASSERT(x)
Definition: SGIO.h:203
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:102
int32_t size() const
Definition: SGVector.h:54
double float64_t
Definition: common.h:48
#define SG_REF(x)
Definition: SGRefObject.h:34
A File access base class.
Definition: File.h:34
SGSparseVector< ST > get_hashed_feature_vector(int32_t vec_idx)
virtual EFeatureClass get_feature_class() const =0
CHashedDenseFeatures(int32_t size=0, bool use_quadr=false, bool keep_lin_terms=true)
virtual int32_t get_num_vectors() const
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:271
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
virtual EFeatureClass get_feature_class() const
virtual void free_feature_iterator(void *iterator)
EFeatureType
shogun feature type
Definition: FeatureTypes.h:16
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
The class DenseFeatures implements dense feature matrices.
Definition: LDA.h:24
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:16
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual int32_t get_dim_feature_space() const
The class Features is the base class of all feature objects.
Definition: Features.h:62
virtual int32_t get_nnz_features_for_vector(int32_t num)
virtual EFeatureType get_feature_type() const
virtual void load(CFile *loader)
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
#define SG_ADD(...)
Definition: SGObject.h:71
virtual void * get_feature_iterator(int32_t vector_index)
virtual EFeatureType get_feature_type() const =0
index_t vlen
Definition: SGVector.h:706
static T abs(T a)
return the absolute value of a number
Definition: Math.h:179
SGSparseVectorEntry< T > * features

SHOGUN Machine Learning Toolbox - Documentation