SHOGUN  v3.2.0
DelimiterTokenizer.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
11 #ifndef _DELIMITERTOKENIZER__H__
12 #define _DELIMITERTOKENIZER__H__
13 
14 #include <shogun/lib/Tokenizer.h>
15 
16 namespace shogun
17 {
18 class CTokenizer;
19 
27 {
28 public:
33  CDelimiterTokenizer(bool skip_delimiters = false);
34 
40 
42  virtual ~CDelimiterTokenizer() {}
43 
48  virtual void set_text(SGVector<char> txt);
49 
55  virtual bool has_next();
56 
65  virtual index_t next_token_idx(index_t& start);
66 
72  virtual const char* get_name() const;
73 
77  void init_for_whitespace();
78 
80 
82  void clear_delimiters();
83 
88  bool get_skip_delimiters() const;
89 
94  void set_skip_delimiters(bool skip_delimiters);
95 
96 private:
97  void init();
98 
99 public:
102 
103 protected:
106 
109 };
110 }
111 #endif /* _DELIMITERTOKENIZER__H__ */
112 
int32_t index_t
Definition: common.h:60
virtual index_t next_token_idx(index_t &start)
void set_skip_delimiters(bool skip_delimiters)
CDelimiterTokenizer * get_copy()
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:27
virtual const char * get_name() const
CDelimiterTokenizer(bool skip_delimiters=false)
All classes and functions are contained in the shogun namespace
Definition: class_list.h:16
The class CDelimiterTokenizer is used to tokenize a SGVector<char> into tokens using custom chars as ...
virtual void set_text(SGVector< char > txt)

SHOGUN Machine Learning Toolbox - Documentation