35 "SmoothedNgramPredictor, a linear interpolating n-gram predictor",
36 "SmoothedNgramPredictor, long description." ),
39 learn_mode_set (false),
81 std::stringstream ss_deltas(value);
84 while (ss_deltas >> delta) {
85 logger << DEBUG <<
"Pushing delta: " << delta <<
endl;
142 const char separator[] =
"|";
143 std::string result = separator;
145 for (Ngram::const_iterator it = ngram.begin();
149 result += *it + separator;
173 unsigned int result = 0;
176 assert(ngram_size >= 0);
178 if (ngram_size > 0) {
179 Ngram ngram(ngram_size);
180 copy(tokens.end() - ngram_size + offset , tokens.end() + offset, ngram.begin());
185 logger << DEBUG <<
"unigram counts sum: " << result <<
endl;
205 logger << DEBUG <<
"Cached tokens[" << cardinality - 1 - i <<
"] = " << tokens[cardinality - 1 - i] <<
endl;
220 std::vector<std::string> prefixCompletionCandidates;
221 for (
size_t k = cardinality; (k > 0 && prefixCompletionCandidates.size() < max_partial_prediction_size); k--) {
222 logger << DEBUG <<
"Building partial prefix completion table of cardinality: " << k <<
endl;
224 Ngram prefix_ngram(k);
225 copy(tokens.end() - k, tokens.end(), prefix_ngram.begin());
228 logger << DEBUG <<
"prefix_ngram: ";
229 for (
size_t r = 0; r < prefix_ngram.size(); r++) {
230 logger << DEBUG << prefix_ngram[r] <<
' ';
241 partial =
db->
getNgramLikeTable(prefix_ngram,max_partial_prediction_size - prefixCompletionCandidates.size());
249 logger << DEBUG <<
"partial prefixCompletionCandidates" << endl
250 << DEBUG <<
"----------------------------------" <<
endl;
251 for (
size_t j = 0; j < partial.size(); j++) {
252 for (
size_t k = 0; k < partial[j].size(); k++) {
253 logger << DEBUG << partial[j][k] <<
" ";
259 logger << DEBUG <<
"Partial prefix completion table contains " << partial.size() <<
" potential completions." <<
endl;
265 std::vector<Ngram>::const_iterator it = partial.begin();
266 while (it != partial.end() && prefixCompletionCandidates.size() < max_partial_prediction_size) {
270 std::string candidate = *(it->end() - 2);
271 if (find(prefixCompletionCandidates.begin(),
272 prefixCompletionCandidates.end(),
273 candidate) == prefixCompletionCandidates.end()) {
274 prefixCompletionCandidates.push_back(candidate);
281 logger << DEBUG <<
"prefixCompletionCandidates" << endl
282 << DEBUG <<
"--------------------------" <<
endl;
283 for (
size_t j = 0; j < prefixCompletionCandidates.size(); j++) {
284 logger << DEBUG << prefixCompletionCandidates[j] <<
endl;
294 for (
size_t j = 0; (j < prefixCompletionCandidates.size() && j < max_partial_prediction_size); j++) {
296 tokens[cardinality - 1] = prefixCompletionCandidates[j];
298 logger << DEBUG <<
"------------------" <<
endl;
299 logger << DEBUG <<
"w_i: " << tokens[cardinality - 1] <<
endl;
301 double probability = 0;
303 double numerator =
count(tokens, 0, k+1);
305 double denominator = (k == 0 ? unigrams_counts_sum :
count(tokens, -1, k));
306 double frequency = ((denominator > 0) ? (numerator / denominator) : 0);
307 probability +=
deltas[k] * frequency;
309 logger << DEBUG <<
"numerator: " << numerator <<
endl;
310 logger << DEBUG <<
"denominator: " << denominator <<
endl;
311 logger << DEBUG <<
"frequency: " << frequency <<
endl;
315 assert(numerator <= denominator);
316 assert(frequency <= 1);
320 logger << DEBUG <<
"probability: " << probability <<
endl;
322 if (probability > 0) {
342 std::map<std::list<std::string>,
int> ngramMap;
346 for (
size_t curr_cardinality = 1;
351 int change_size = change.size();
353 std::list<std::string> ngram_list;
357 (i < curr_cardinality - 1 && change_idx < change_size);
360 ngram_list.push_back(change[change_idx]);
364 while (change_idx < change_size)
366 ngram_list.push_back(change[change_idx++]);
367 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
368 ngram_list.pop_front();
391 if (change.size() > 0 &&
396 std::list<std::string> ngram_list(change.begin(), change.begin() + 1);
412 logger << DEBUG <<
"Adding extra token: " << extra_token <<
endl;
414 if (extra_token.empty())
418 ngram_list.push_front(extra_token);
420 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
429 std::map<std::list<std::string>,
int>::const_iterator it;
430 for (it = ngramMap.begin(); it != ngramMap.end(); it++)
433 Ngram ngram((it->first).begin(), (it->first).end());
451 logger << INFO <<
"Committed learning update to database" <<
endl;
456 logger << ERROR <<
"Rolling back learning update : " << ex.
what() <<
endl;
472 size_t size = ngram.size();
473 for (
size_t i = 0; i < size; i++) {
474 if (
count(ngram, -i, size - i) >
count(ngram, -(i + 1), size - (i + 1))) {
475 logger << INFO <<
"consistency adjustment needed!" <<
endl;
477 int offset = -(i + 1);
478 int sub_ngram_size = size - (i + 1);
480 logger << DEBUG <<
"i: " << i <<
" | offset: " << offset <<
" | sub_ngram_size: " << sub_ngram_size <<
endl;
482 Ngram sub_ngram(sub_ngram_size);
483 copy(ngram.end() - sub_ngram_size + offset, ngram.end() + offset, sub_ngram.begin());
486 logger <<
"ngram to be count adjusted is: ";
487 for (
size_t i = 0; i < sub_ngram.size(); i++) {
488 logger << sub_ngram[i] <<
' ';
494 logger << DEBUG <<
"consistency adjusted" <<
endl;
void dispatch(const Observable *var)
virtual void beginTransaction() const
~SmoothedNgramPredictor()
void check_learn_consistency(const Ngram &name) const
Dispatcher< SmoothedNgramPredictor > dispatcher
NgramTable getNgramLikeTableFiltered(const Ngram ngram, const char **filter, int limit=-1) const
int getUnigramCountsSum() const
Variable * find(const std::string &variable) const
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
virtual void learn(const std::vector< std::string > &change)
int getNgramCount(const Ngram ngram) const
void set_database_logger_level(const std::string &level)
virtual Prediction predict(const size_t size, const char **filter) const
Generate prediction.
virtual void set_logger(const std::string &level)
unsigned int count(const std::vector< std::string > &tokens, int offset, int ngram_size) const
Builds the required n-gram and returns its count.
void set_deltas(const std::string &deltas)
const std::string PREDICTORS
static std::string ngram_to_string(const Ngram &ngram)
virtual void endTransaction() const
std::vector< double > deltas
void init_database_connector_if_ready()
void updateNgram(const Ngram ngram, const int count) const
static double toDouble(const std::string)
virtual void update(const Observable *variable)
std::vector< Ngram > NgramTable
void set_learn(const std::string &learn_mode)
virtual std::string get_name() const =0
SmoothedNgramPredictor(Configuration *, ContextTracker *, const char *)
void map(Observable *var, const mbr_func_ptr_t &ptr)
void insertNgram(const Ngram ngram, const int count) const
ContextTracker * contextTracker
std::string getToken(const int) const
void addSuggestion(Suggestion)
static bool isYes(const char *)
std::string DATABASE_LOGGER
Tracks user interaction and context.
virtual std::string get_value() const =0
virtual void rollbackTransaction() const
virtual const char * what() const
void set_dbfilename(const std::string &filename)
int incrementNgramCount(const Ngram ngram) const
NgramTable getNgramLikeTable(const Ngram ngram, int limit=-1) const
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)