RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
16 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17 
18 #include <boost/python/object.hpp>
19 #include <boost/python/str.hpp>
20 #include <boost/python/extract.hpp>
21 
22 #include <boost/optional.hpp>
23 #include <boost/utility/typed_in_place_factory.hpp>
24 
25 //#include <tbxx/error_utils.hpp>
26 #include <RDGeneral/Invariant.h>
27 
28 #include <streambuf>
29 #include <iostream>
30 
31 namespace boost_adaptbx { namespace python {
32 
33 namespace bp = boost::python;
34 
35 /// A stream buffer getting data from and putting data into a Python file object
36 /** The aims are as follows:
37 
38  - Given a C++ function acting on a standard stream, e.g.
39 
40  \code
41  void read_inputs(std::istream& input) {
42  ...
43  input >> something >> something_else;
44  }
45  \endcode
46 
47  and given a piece of Python code which creates a file-like object,
48  to be able to pass this file object to that C++ function, e.g.
49 
50  \code
51  import gzip
52  gzip_file_obj = gzip.GzipFile(...)
53  read_inputs(gzip_file_obj)
54  \endcode
55 
56  and have the standard stream pull data from and put data into the Python
57  file object.
58 
59  - When Python \c read_inputs() returns, the Python object is able to
60  continue reading or writing where the C++ code left off.
61 
62  - Operations in C++ on mere files should be competitively fast compared
63  to the direct use of \c std::fstream.
64 
65 
66  \b Motivation
67 
68  - the standard Python library offer of file-like objects (files,
69  compressed files and archives, network, ...) is far superior to the
70  offer of streams in the C++ standard library and Boost C++ libraries.
71 
72  - i/o code involves a fair amount of text processing which is more
73  efficiently prototyped in Python but then one may need to rewrite
74  a time-critical part in C++, in as seamless a manner as possible.
75 
76  \b Usage
77 
78  This is 2-step:
79 
80  - a trivial wrapper function
81 
82  \code
83  using boost_adaptbx::python::streambuf;
84  void read_inputs_wrapper(streambuf& input)
85  {
86  streambuf::istream is(input);
87  read_inputs(is);
88  }
89 
90  def("read_inputs", read_inputs_wrapper);
91  \endcode
92 
93  which has to be written every time one wants a Python binding for
94  such a C++ function.
95 
96  - the Python side
97 
98  \code
99  from boost.python import streambuf
100  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
101  \endcode
102 
103  \c buffer_size is optional. See also: \c default_buffer_size
104 
105  Note: references are to the C++ standard (the numbers between parentheses
106  at the end of references are margin markers).
107 */
108 class streambuf : public std::basic_streambuf<char>
109 {
110  private:
111  typedef std::basic_streambuf<char> base_t;
112 
113  public:
114  /* The syntax
115  using base_t::char_type;
116  would be nicer but Visual Studio C++ 8 chokes on it
117  */
118  typedef base_t::char_type char_type;
119  typedef base_t::int_type int_type;
120  typedef base_t::pos_type pos_type;
121  typedef base_t::off_type off_type;
122  typedef base_t::traits_type traits_type;
123 
124  // work around Visual C++ 7.1 problem
125  inline static int
126  traits_type_eof() { return traits_type::eof(); }
127 
128  /// The default size of the read and write buffer.
129  /** It sizes both the buffer for data read from the Python file object
130  and the buffer for data written to it. It is a compile-time constant.
131  */
132  const static std::size_t default_buffer_size=1024;
133 
134  /// Construct from a Python file object
135  /** if buffer_size is 0 the current default_buffer_size is used.
136  */
138  bp::object& python_file_obj,
139  std::size_t buffer_size_=0)
140  :
141  py_read (getattr(python_file_obj, "read", bp::object())),
142  py_write(getattr(python_file_obj, "write", bp::object())),
143  py_seek (getattr(python_file_obj, "seek", bp::object())),
144  py_tell (getattr(python_file_obj, "tell", bp::object())),
145  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
146  write_buffer(0),
147  pos_of_read_buffer_end_in_py_file(0),
148  pos_of_write_buffer_end_in_py_file(buffer_size),
149  farthest_pptr(0)
150  {
151  TEST_ASSERT(buffer_size != 0);
152  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
153  have non-functional seek and tell. If so, assign None to
154  py_tell and py_seek.
155  */
156  if (py_tell != bp::object()) {
157  try {
158  off_type py_pos = bp::extract<off_type>(py_tell());
159  if(py_seek != bp::object()){
160  /* Make sure we can actually seek.
161  bzip2 readers from python have a seek method, but it fails
162  when they are in write mode.
163  */
164  py_seek(py_pos);
165  }
166  }
167  catch (bp::error_already_set&) {
168  py_tell = bp::object();
169  py_seek = bp::object();
170  /* Boost.Python does not do any Python exception handling whatsoever
171  So we need to catch it by hand like so.
172  */
173  PyErr_Clear();
174  }
175  }
176 
177  if (py_write != bp::object()) {
178  // C-like string to make debugging easier
179  write_buffer = new char[buffer_size + 1];
180  write_buffer[buffer_size] = '\0';
181  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
182  farthest_pptr = pptr();
183  }
184  else {
185  // The first attempt at output will result in a call to overflow
186  setp(0, 0);
187  }
188 
189  if (py_tell != bp::object()) {
190  off_type py_pos = bp::extract<off_type>(py_tell());
191  pos_of_read_buffer_end_in_py_file = py_pos;
192  pos_of_write_buffer_end_in_py_file = py_pos;
193  }
194  }
195 
196  /// Mundane destructor freeing the allocated resources
197  virtual ~streambuf() {
198  if (write_buffer) delete[] write_buffer;
199  }
200 
201  /// C.f. C++ standard section 27.5.2.4.3
202  /** It is essential to override this virtual function for the stream
203  member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
204  */
205  virtual std::streamsize showmanyc() {
206  int_type const failure = traits_type::eof();
207  int_type status = underflow();
208  if (status == failure) return -1;
209  return egptr() - gptr();
210  }
211 
212  /// C.f. C++ standard section 27.5.2.4.3
213  virtual int_type underflow() {
214  int_type const failure = traits_type::eof();
215  if (py_read == bp::object()) {
216  throw std::invalid_argument(
217  "That Python file object has no 'read' attribute");
218  }
219  read_buffer = py_read(buffer_size);
220  char *read_buffer_data;
221  bp::ssize_t py_n_read;
222  if (PyBytes_AsStringAndSize(read_buffer.ptr(),
223  &read_buffer_data, &py_n_read) == -1) {
224  setg(0, 0, 0);
225  throw std::invalid_argument(
226  "The method 'read' of the Python file object "
227  "did not return a string.");
228  }
229  off_type n_read = (off_type)py_n_read;
230  pos_of_read_buffer_end_in_py_file += n_read;
231  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
232  // ^^^27.5.2.3.1 (4)
233  if (n_read == 0) return failure;
234  return traits_type::to_int_type(read_buffer_data[0]);
235  }
236 
237  /// C.f. C++ standard section 27.5.2.4.5
238  virtual int_type overflow(int_type c=traits_type_eof()) {
239  if (py_write == bp::object()) {
240  throw std::invalid_argument(
241  "That Python file object has no 'write' attribute");
242  }
243  farthest_pptr = std::max(farthest_pptr, pptr());
244  off_type n_written = (off_type)(farthest_pptr - pbase());
245  bp::str chunk(pbase(), farthest_pptr);
246  py_write(chunk);
247  if (!traits_type::eq_int_type(c, traits_type::eof())) {
248  py_write(traits_type::to_char_type(c));
249  n_written++;
250  }
251  if (n_written) {
252  pos_of_write_buffer_end_in_py_file += n_written;
253  setp(pbase(), epptr());
254  // ^^^ 27.5.2.4.5 (5)
255  farthest_pptr = pptr();
256  }
257  return traits_type::eq_int_type(
258  c, traits_type::eof()) ? traits_type::not_eof(c) : c;
259  }
260 
261  /// Update the python file to reflect the state of this stream buffer
262  /** Empty the write buffer into the Python file object and set the seek
263  position of the latter accordingly (C++ standard section 27.5.2.4.2).
264  If there is no write buffer or it is empty, but there is a non-empty
265  read buffer, set the Python file object seek position to the
266  seek position in that read buffer.
267  */
268  virtual int sync() {
269  int result = 0;
270  farthest_pptr = std::max(farthest_pptr, pptr());
271  if (farthest_pptr && farthest_pptr > pbase()) {
272  off_type delta = pptr() - farthest_pptr;
273  int_type status = overflow();
274  if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
275  if (py_seek != bp::object()) py_seek(delta, 1);
276  }
277  else if (gptr() && gptr() < egptr()) {
278  if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
279  }
280  return result;
281  }
282 
283  /// C.f. C++ standard section 27.5.2.4.2
284  /** This implementation is optimised to look whether the position is within
285  the buffers, so as to avoid calling Python seek or tell. It is
286  important for many applications that the overhead of calling into Python
287  is avoided as much as possible (e.g. parsers which may do a lot of
288  backtracking)
289  */
290  virtual
291  pos_type seekoff(off_type off, std::ios_base::seekdir way,
292  std::ios_base::openmode which= std::ios_base::in
293  | std::ios_base::out)
294  {
295  /* In practice, "which" is either std::ios_base::in or out
296  since we end up here because either seekp or seekg was called
297  on the stream using this buffer. That simplifies the code
298  in a few places.
299  */
300  int const failure = off_type(-1);
301 
302  if (py_seek == bp::object()) {
303  throw std::invalid_argument(
304  "That Python file object has no 'seek' attribute");
305  }
306 
307  // we need the read buffer to contain something!
308  if (which == std::ios_base::in && !gptr()) {
309  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
310  return failure;
311  }
312  }
313 
314  // compute the whence parameter for Python seek
315  int whence;
316  switch (way) {
317  case std::ios_base::beg:
318  whence = 0;
319  break;
320  case std::ios_base::cur:
321  whence = 1;
322  break;
323  case std::ios_base::end:
324  whence = 2;
325  break;
326  default:
327  return failure;
328  }
329 
330  // Let's have a go
331  boost::optional<off_type> result = seekoff_without_calling_python(
332  off, way, which);
333  if (!result) {
334  // we need to call Python
335  if (which == std::ios_base::out) overflow();
336  if (way == std::ios_base::cur) {
337  if (which == std::ios_base::in) off -= egptr() - gptr();
338  else if (which == std::ios_base::out) off += pptr() - pbase();
339  }
340  py_seek(off, whence);
341  result = off_type(bp::extract<off_type>(py_tell()));
342  if (which == std::ios_base::in) underflow();
343  }
344  return *result;
345  }
346 
347  /// C.f. C++ standard section 27.5.2.4.2
348  virtual
349  pos_type seekpos(pos_type sp,
350  std::ios_base::openmode which= std::ios_base::in
351  | std::ios_base::out)
352  {
353  return streambuf::seekoff(sp, std::ios_base::beg, which);
354  }
355 
356  private:
357  bp::object py_read, py_write, py_seek, py_tell;
358 
359  std::size_t buffer_size;
360 
361  /* This is actually a Python string and the actual read buffer is
362  its internal data, i.e. an array of characters. We use a Boost.Python
363  object so as to hold on it: as a result, the actual buffer can't
364  go away.
365  */
366  bp::object read_buffer;
367 
368  /* A mere array of char's allocated on the heap at construction time and
369  de-allocated only at destruction time.
370  */
371  char *write_buffer;
372 
373  off_type pos_of_read_buffer_end_in_py_file,
374  pos_of_write_buffer_end_in_py_file;
375 
376  // the farthest place the buffer has been written into
377  char *farthest_pptr;
378 
379 
380  boost::optional<off_type> seekoff_without_calling_python(
381  off_type off,
382  std::ios_base::seekdir way,
383  std::ios_base::openmode which)
384  {
385  boost::optional<off_type> const failure;
386 
387  // Buffer range and current position
388  off_type buf_begin, buf_end, buf_cur, upper_bound;
389  off_type pos_of_buffer_end_in_py_file;
390  if (which == std::ios_base::in) {
391  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
392  buf_begin = reinterpret_cast<std::streamsize>(eback());
393  buf_cur = reinterpret_cast<std::streamsize>(gptr());
394  buf_end = reinterpret_cast<std::streamsize>(egptr());
395  upper_bound = buf_end;
396  }
397  else if (which == std::ios_base::out) {
398  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
399  buf_begin = reinterpret_cast<std::streamsize>(pbase());
400  buf_cur = reinterpret_cast<std::streamsize>(pptr());
401  buf_end = reinterpret_cast<std::streamsize>(epptr());
402  farthest_pptr = std::max(farthest_pptr, pptr());
403  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
404  }
405  else {
406  CHECK_INVARIANT(0,"unreachable code");
407  }
408 
409  // Sought position in "buffer coordinate"
410  off_type buf_sought;
411  if (way == std::ios_base::cur) {
412  buf_sought = buf_cur + off;
413  }
414  else if (way == std::ios_base::beg) {
415  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
416  }
417  else if (way == std::ios_base::end) {
418  return failure;
419  }
420  else {
421  CHECK_INVARIANT(0,"unreachable code");
422  }
423 
424  // if the sought position is not in the buffer, give up
425  if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
426 
427  // we are in wonderland
428  if (which == std::ios_base::in) gbump(buf_sought - buf_cur);
429  else if (which == std::ios_base::out) pbump(buf_sought - buf_cur);
430  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
431  }
432 
433  public:
434 
435  class istream : public std::istream
436  {
437  public:
438  istream(streambuf& buf) : std::istream(&buf)
439  {
440  exceptions(std::ios_base::badbit);
441  }
442 
443  ~istream() { if (this->good()) this->sync(); }
444  };
445 
446  class ostream : public std::ostream
447  {
448  public:
449  ostream(streambuf& buf) : std::ostream(&buf)
450  {
451  exceptions(std::ios_base::badbit);
452  }
453 
454  ~ostream() { if (this->good()) this->flush(); }
455  };
456 };
457 
458  //std::size_t streambuf::default_buffer_size = 1024;
459 
461 {
463 
465  bp::object& python_file_obj,
466  std::size_t buffer_size=0)
467  :
468  python_streambuf(python_file_obj, buffer_size)
469  {}
470 };
471 
473 {
475  bp::object& python_file_obj,
476  std::size_t buffer_size=0)
477  :
478  streambuf_capsule(python_file_obj, buffer_size),
479  streambuf::ostream(python_streambuf)
480  {}
481 
483  {
484  try {
485  if (this->good()) this->flush();
486  }
487  catch (bp::error_already_set&) {
488  PyErr_Clear();
489  throw std::runtime_error(
490  "Problem closing python ostream.\n"
491  " Known limitation: the error is unrecoverable. Sorry.\n"
492  " Suggestion for programmer: add ostream.flush() before"
493  " returning.");
494  }
495  }
496 };
497 
498 }} // boost_adaptbx::python
499 
500 #endif // GUARD
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)
virtual int sync()
Update the python file to reflect the state of this stream buffer.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
static const std::size_t default_buffer_size
The default size of the read and write buffer.
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:114
STL namespace.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
virtual int_type underflow()
C.f. C++ standard section 27.5.2.4.3.
virtual int_type overflow(int_type c=traits_type_eof())
C.f. C++ standard section 27.5.2.4.5.
#define TEST_ASSERT(expr)
Definition: Invariant.h:139
virtual pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual std::streamsize showmanyc()
C.f. C++ standard section 27.5.2.4.3.
virtual ~streambuf()
Mundane destructor freeing the allocated resources.
A stream buffer getting data from and putting data into a Python file object.