SHOGUN  v3.2.0
NGramTokenizer.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/Parameter.h>
13 
14 namespace shogun
15 {
16 
18 {
19  n = ns;
20  last_idx = 0;
21  init();
22 }
23 
25 : CTokenizer(orig)
26 {
28  n = orig.n;
29  init();
30 }
31 
/**
 * Registers the members of this tokenizer through the SG_ADD macro:
 * `n` under the name "n" (described as "Size of n-grams") and `last_idx`
 * under the name "last_idx" (described as "Index of last token").
 *
 * NOTE(review): listing lines 35 and 37, which presumably hold the remaining
 * arguments of the two SG_ADD calls, are not present in this listing —
 * confirm against the original NGramTokenizer.cpp.
 */
32 void CNGramTokenizer::init()
33 {
34  SG_ADD(&n, "n", "Size of n-grams",
36  SG_ADD(&last_idx, "last_idx", "Index of last token",
38 }
39 
41 {
42  last_idx = 0;
44 }
45 
/**
 * Returns the name of this class.
 *
 * @return the string literal "NGramTokenizer"
 */
46 const char* CNGramTokenizer::get_name() const
47 {
48  return "NGramTokenizer";
49 }
50 
52 {
53  return last_idx<=text.size()-n;
54 }
55 
57 {
58  start = last_idx++;
59  return start + n;
60 }
61 
63 {
65  return t;
66 }
67 }
The class CNGramTokenizer is used to tokenize a SGVector<char> into n-grams.
virtual const char * get_name() const
int32_t index_t
Definition: common.h:60
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
SGVector< char > text
Definition: Tokenizer.h:71
int32_t size() const
Definition: SGVector.h:54
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implement the methods has_next(), next_token_idx() and get_copy().
Definition: Tokenizer.h:27
virtual void set_text(SGVector< char > txt)
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:16
virtual index_t next_token_idx(index_t &start)
CNGramTokenizer(int32_t ns=3)
#define SG_ADD(...)
Definition: SGObject.h:71
virtual CNGramTokenizer * get_copy()

SHOGUN Machine Learning Toolbox - Documentation