RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #include <RDGeneral/export.h>
16 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19 #include <boost/python/object.hpp>
20 #include <boost/python/str.hpp>
21 #include <boost/python/extract.hpp>
22 
23 #include <boost/optional.hpp>
24 #include <boost/utility/typed_in_place_factory.hpp>
26 
27 //#include <tbxx/error_utils.hpp>
28 #include <RDGeneral/Invariant.h>
29 
30 #include <streambuf>
31 #include <iostream>
32 
33 namespace boost_adaptbx {
34 namespace python {
35 
36 namespace bp = boost::python;
37 
38 /// A stream buffer getting data from and putting data into a Python file object
39 /** The aims are as follows:
40 
41  - Given a C++ function acting on a standard stream, e.g.
42 
43  \code
44  void read_inputs(std::istream& input) {
45  ...
46  input >> something >> something_else;
47  }
48  \endcode
49 
50  and given a piece of Python code which creates a file-like object,
51  to be able to pass this file object to that C++ function, e.g.
52 
53  \code
54  import gzip
55  gzip_file_obj = gzip.GzipFile(...)
56  read_inputs(gzip_file_obj)
57  \endcode
58 
59  and have the standard stream pull data from and put data into the Python
60  file object.
61 
62  - When Python \c read_inputs() returns, the Python object is able to
63  continue reading or writing where the C++ code left off.
64 
65  - Operations in C++ on mere files should be competitively fast compared
66  to the direct use of \c std::fstream.
67 
68 
69  \b Motivation
70 
71  - the range of file-like objects offered by the standard Python library
72  (files, compressed files and archives, network, ...) is far wider than the
73  range of streams offered by the C++ standard library and Boost C++ libraries.
74 
75  - i/o code involves a fair amount of text processing which is more
76  efficiently prototyped in Python but then one may need to rewrite
77  a time-critical part in C++, in as seamless a manner as possible.
78 
79  \b Usage
80 
81  This is a two-step process:
82 
83  - a trivial wrapper function
84 
85  \code
86  using boost_adaptbx::python::streambuf;
87  void read_inputs_wrapper(streambuf& input)
88  {
89  streambuf::istream is(input);
90  read_inputs(is);
91  }
92 
93  def("read_inputs", read_inputs_wrapper);
94  \endcode
95 
96  which has to be written every time one wants a Python binding for
97  such a C++ function.
98 
99  - the Python side
100 
101  \code
102  from boost.python import streambuf
103  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
104  \endcode
105 
106  \c buffer_size is optional. See also: \c default_buffer_size
107 
108  Note: references are to the C++ standard (the numbers between parentheses
109  at the end of references are margin markers).
110 */
111 class streambuf : public std::basic_streambuf<char> {
112  private:
113  typedef std::basic_streambuf<char> base_t;
114 
115  public:
116  /* The syntax
117  using base_t::char_type;
118  would be nicer but Visual Studio C++ 8 chokes on it
119  */
120  typedef base_t::char_type char_type;
121  typedef base_t::int_type int_type;
122  typedef base_t::pos_type pos_type;
123  typedef base_t::off_type off_type;
124  typedef base_t::traits_type traits_type;
125 
126  // work around Visual C++ 7.1 problem
127  inline static int traits_type_eof() { return traits_type::eof(); }
128 
129  /// The default size of the read and write buffer.
130  /** The two buffers hold, respectively, data read from and data to be written
131  to the Python file object. This value is a constant: pass a non-zero
132  buffer_size to the constructor to use a different size. */
133  const static std::size_t default_buffer_size = 1024;
134 
135  /// Construct from a Python file object
136  /** if buffer_size is 0 the current default_buffer_size is used.
137  */
138  streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
139  : py_read(getattr(python_file_obj, "read", bp::object())),
140  py_write(getattr(python_file_obj, "write", bp::object())),
141  py_seek(getattr(python_file_obj, "seek", bp::object())),
142  py_tell(getattr(python_file_obj, "tell", bp::object())),
143  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
144  write_buffer(0),
145  pos_of_read_buffer_end_in_py_file(0),
146  pos_of_write_buffer_end_in_py_file(buffer_size),
147  farthest_pptr(0) {
148  TEST_ASSERT(buffer_size != 0);
149  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
150  have non-functional seek and tell. If so, assign None to
151  py_tell and py_seek.
152  */
153  if (py_tell != bp::object()) {
154  try {
155  off_type py_pos = bp::extract<off_type>(py_tell());
156  if (py_seek != bp::object()) {
157  /* Make sure we can actually seek.
158  bzip2 readers from python have a seek method, but it fails
159  when they are in write mode.
160  */
161  py_seek(py_pos);
162  }
163  } catch (bp::error_already_set&) {
164  py_tell = bp::object();
165  py_seek = bp::object();
166  /* Boost.Python does not do any Python exception handling whatsoever
167  So we need to catch it by hand like so.
168  */
169  PyErr_Clear();
170  }
171  }
172 
173  if (py_write != bp::object()) {
174  // C-like string to make debugging easier
175  write_buffer = new char[buffer_size + 1];
176  write_buffer[buffer_size] = '\0';
177  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
178  farthest_pptr = pptr();
179  } else {
180  // The first attempt at output will result in a call to overflow
181  setp(0, 0);
182  }
183 
184  if (py_tell != bp::object()) {
185  off_type py_pos = bp::extract<off_type>(py_tell());
186  pos_of_read_buffer_end_in_py_file = py_pos;
187  pos_of_write_buffer_end_in_py_file = py_pos;
188  }
189  }
190 
191  /// Mundane destructor freeing the allocated resources
192  virtual ~streambuf() {
193  if (write_buffer) delete[] write_buffer;
194  }
195 
196  /// C.f. C++ standard section 27.5.2.4.3
197  /** It is essential to override this virtual function for the stream
198  member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
199  */
200  virtual std::streamsize showmanyc() {
201  int_type const failure = traits_type::eof();
202  int_type status = underflow();
203  if (status == failure) return -1;
204  return egptr() - gptr();
205  }
206 
207  /// C.f. C++ standard section 27.5.2.4.3
208  virtual int_type underflow() {
209  int_type const failure = traits_type::eof();
210  if (py_read == bp::object()) {
211  throw std::invalid_argument(
212  "That Python file object has no 'read' attribute");
213  }
214  read_buffer = py_read(buffer_size);
215  char* read_buffer_data;
216  bp::ssize_t py_n_read;
217  if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
218  &py_n_read) == -1) {
219  setg(0, 0, 0);
220  throw std::invalid_argument(
221  "The method 'read' of the Python file object "
222  "did not return a string.");
223  }
224  off_type n_read = (off_type)py_n_read;
225  pos_of_read_buffer_end_in_py_file += n_read;
226  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
227  // ^^^27.5.2.3.1 (4)
228  if (n_read == 0) return failure;
229  return traits_type::to_int_type(read_buffer_data[0]);
230  }
231 
232  /// C.f. C++ standard section 27.5.2.4.5
233  virtual int_type overflow(int_type c = traits_type_eof()) {
234  if (py_write == bp::object()) {
235  throw std::invalid_argument(
236  "That Python file object has no 'write' attribute");
237  }
238  farthest_pptr = std::max(farthest_pptr, pptr());
239  off_type n_written = (off_type)(farthest_pptr - pbase());
240  bp::str chunk(pbase(), farthest_pptr);
241  py_write(chunk);
242  if (!traits_type::eq_int_type(c, traits_type::eof())) {
243  py_write(traits_type::to_char_type(c));
244  n_written++;
245  }
246  if (n_written) {
247  pos_of_write_buffer_end_in_py_file += n_written;
248  setp(pbase(), epptr());
249  // ^^^ 27.5.2.4.5 (5)
250  farthest_pptr = pptr();
251  }
252  return traits_type::eq_int_type(c, traits_type::eof())
253  ? traits_type::not_eof(c)
254  : c;
255  }
256 
257  /// Update the python file to reflect the state of this stream buffer
258  /** Empty the write buffer into the Python file object and set the seek
259  position of the latter accordingly (C++ standard section 27.5.2.4.2).
260  If there is no write buffer or it is empty, but there is a non-empty
261  read buffer, set the Python file object seek position to the
262  seek position in that read buffer.
263  */
264  virtual int sync() {
265  int result = 0;
266  farthest_pptr = std::max(farthest_pptr, pptr());
267  if (farthest_pptr && farthest_pptr > pbase()) {
268  off_type delta = pptr() - farthest_pptr;
269  int_type status = overflow();
270  if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
271  if (py_seek != bp::object()) py_seek(delta, 1);
272  } else if (gptr() && gptr() < egptr()) {
273  if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
274  }
275  return result;
276  }
277 
278  /// C.f. C++ standard section 27.5.2.4.2
279  /** This implementation is optimised to look whether the position is within
280  the buffers, so as to avoid calling Python seek or tell. It is
281  important for many applications that the overhead of calling into Python
282  is avoided as much as possible (e.g. parsers which may do a lot of
283  backtracking)
284  */
285  virtual pos_type seekoff(off_type off, std::ios_base::seekdir way,
286  std::ios_base::openmode which = std::ios_base::in |
287  std::ios_base::out) {
288  /* In practice, "which" is either std::ios_base::in or out
289  since we end up here because either seekp or seekg was called
290  on the stream using this buffer. That simplifies the code
291  in a few places.
292  */
293  int const failure = off_type(-1);
294 
295  if (py_seek == bp::object()) {
296  throw std::invalid_argument(
297  "That Python file object has no 'seek' attribute");
298  }
299 
300  // we need the read buffer to contain something!
301  if (which == std::ios_base::in && !gptr()) {
302  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
303  return failure;
304  }
305  }
306 
307  // compute the whence parameter for Python seek
308  int whence;
309  switch (way) {
310  case std::ios_base::beg:
311  whence = 0;
312  break;
313  case std::ios_base::cur:
314  whence = 1;
315  break;
316  case std::ios_base::end:
317  whence = 2;
318  break;
319  default:
320  return failure;
321  }
322 
323  // Let's have a go
324  boost::optional<off_type> result =
325  seekoff_without_calling_python(off, way, which);
326  if (!result) {
327  // we need to call Python
328  if (which == std::ios_base::out) overflow();
329  if (way == std::ios_base::cur) {
330  if (which == std::ios_base::in)
331  off -= egptr() - gptr();
332  else if (which == std::ios_base::out)
333  off += pptr() - pbase();
334  }
335  py_seek(off, whence);
336  result = off_type(bp::extract<off_type>(py_tell()));
337  if (which == std::ios_base::in) underflow();
338  }
339  return *result;
340  }
341 
342  /// C.f. C++ standard section 27.5.2.4.2
343  virtual pos_type seekpos(pos_type sp,
344  std::ios_base::openmode which = std::ios_base::in |
345  std::ios_base::out) {
346  return streambuf::seekoff(sp, std::ios_base::beg, which);
347  }
348 
349  private:
350  bp::object py_read, py_write, py_seek, py_tell;
351 
352  std::size_t buffer_size;
353 
354  /* This is actually a Python string and the actual read buffer is
355  its internal data, i.e. an array of characters. We use a Boost.Python
356  object so as to hold on it: as a result, the actual buffer can't
357  go away.
358  */
359  bp::object read_buffer;
360 
361  /* A mere array of char's allocated on the heap at construction time and
362  de-allocated only at destruction time.
363  */
364  char* write_buffer;
365 
366  off_type pos_of_read_buffer_end_in_py_file,
367  pos_of_write_buffer_end_in_py_file;
368 
369  // the farthest place the buffer has been written into
370  char* farthest_pptr;
371 
372  boost::optional<off_type> seekoff_without_calling_python(
373  off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
374  boost::optional<off_type> const failure;
375 
376  // Buffer range and current position
377  off_type buf_begin, buf_end, buf_cur, upper_bound;
378  off_type pos_of_buffer_end_in_py_file;
379  if (which == std::ios_base::in) {
380  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
381  buf_begin = reinterpret_cast<std::streamsize>(eback());
382  buf_cur = reinterpret_cast<std::streamsize>(gptr());
383  buf_end = reinterpret_cast<std::streamsize>(egptr());
384  upper_bound = buf_end;
385  } else if (which == std::ios_base::out) {
386  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
387  buf_begin = reinterpret_cast<std::streamsize>(pbase());
388  buf_cur = reinterpret_cast<std::streamsize>(pptr());
389  buf_end = reinterpret_cast<std::streamsize>(epptr());
390  farthest_pptr = std::max(farthest_pptr, pptr());
391  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
392  } else {
393  CHECK_INVARIANT(0, "unreachable code");
394  }
395 
396  // Sought position in "buffer coordinate"
397  off_type buf_sought;
398  if (way == std::ios_base::cur) {
399  buf_sought = buf_cur + off;
400  } else if (way == std::ios_base::beg) {
401  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
402  } else if (way == std::ios_base::end) {
403  return failure;
404  } else {
405  CHECK_INVARIANT(0, "unreachable code");
406  }
407 
408  // if the sought position is not in the buffer, give up
409  if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
410 
411  // we are in wonderland
412  if (which == std::ios_base::in)
413  gbump(buf_sought - buf_cur);
414  else if (which == std::ios_base::out)
415  pbump(buf_sought - buf_cur);
416  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
417  }
418 
419  public:
420  class istream : public std::istream {
421  public:
422  istream(streambuf& buf) : std::istream(&buf) {
423  exceptions(std::ios_base::badbit);
424  }
425 
427  // do nothing.
428  // This used to do:
429  // if (this->good()) this->sync();
430  // but that caused problems if the underlying file had been closed
431  // (see github #579) and really doesn't seem necessary for what we're
432  // doing.
433  }
434  };
435 
436  class ostream : public std::ostream {
437  public:
438  ostream(streambuf& buf) : std::ostream(&buf) {
439  exceptions(std::ios_base::badbit);
440  }
441 
443  if (this->good()) this->flush();
444  }
445  };
446 };
447 
448 // std::size_t streambuf::default_buffer_size = 1024;
449 
452 
453  streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
454  : python_streambuf(python_file_obj, buffer_size) {}
455 };
456 
458  ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
459  : streambuf_capsule(python_file_obj, buffer_size),
460  streambuf::ostream(python_streambuf) {}
461 
462  ~ostream() throw() {
463  try {
464  if (this->good()) this->flush();
465  } catch (bp::error_already_set&) {
466  PyErr_Clear();
467  throw std::runtime_error(
468  "Problem closing python ostream.\n"
469  " Known limitation: the error is unrecoverable. Sorry.\n"
470  " Suggestion for programmer: add ostream.flush() before"
471  " returning.");
472  }
473  }
474 };
475 } // namespace python
476 } // namespace boost_adaptbx
477 
478 #endif // GUARD
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)
virtual int sync()
Update the python file to reflect the state of this stream buffer.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
static const std::size_t default_buffer_size
The default size of the read and write buffer.
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:100
STL namespace.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
virtual int_type underflow()
C.f. C++ standard section 27.5.2.4.3.
virtual int_type overflow(int_type c=traits_type_eof())
C.f. C++ standard section 27.5.2.4.5.
#define TEST_ASSERT(expr)
Definition: Invariant.h:151
virtual pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
virtual std::streamsize showmanyc()
C.f. C++ standard section 27.5.2.4.3.
virtual ~streambuf()
Mundane destructor freeing the allocated resources.
A stream buffer getting data from and putting data into a Python file object.