/* * C side structures to provide capabilities to read Python file like objects * in chunks, or iterate through iterables with each result representing a * single line of a file. */ #define PY_SSIZE_T_CLEAN #include #include #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "textreading/stream.h" #define READ_CHUNKSIZE 1 << 14 typedef struct { stream stream; /* The Python file object being read. */ PyObject *file; /* The `read` attribute of the file object. */ PyObject *read; /* Amount to read each time we call `obj.read()` */ PyObject *chunksize; /* Python str object holding the line most recently read from the file. */ PyObject *chunk; /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ const char *encoding; } python_chunks_from_file; /* * Helper function to support byte objects as well as unicode strings. * * NOTE: Steals a reference to `str` (although usually returns it unmodified). */ static inline PyObject * process_stringlike(PyObject *str, const char *encoding) { if (PyBytes_Check(str)) { PyObject *ustr; ustr = PyUnicode_FromEncodedObject(str, encoding, NULL); if (ustr == NULL) { return NULL; } Py_DECREF(str); return ustr; } else if (!PyUnicode_Check(str)) { PyErr_SetString(PyExc_TypeError, "non-string returned while reading data"); Py_DECREF(str); return NULL; } return str; } static inline void buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind) { Py_ssize_t length = PyUnicode_GET_LENGTH(str); *kind = PyUnicode_KIND(str); if (*kind == PyUnicode_1BYTE_KIND) { *start = (char *)PyUnicode_1BYTE_DATA(str); } else if (*kind == PyUnicode_2BYTE_KIND) { *start = (char *)PyUnicode_2BYTE_DATA(str); length *= sizeof(Py_UCS2); } else if (*kind == PyUnicode_4BYTE_KIND) { *start = (char *)PyUnicode_4BYTE_DATA(str); length *= sizeof(Py_UCS4); } *end = *start + length; } static int fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) { Py_XDECREF(fb->chunk); fb->chunk = NULL; PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL); if (chunk == NULL) { return -1; } fb->chunk = process_stringlike(chunk, fb->encoding); if (fb->chunk == NULL) { return -1; } buffer_info_from_unicode(fb->chunk, start, end, kind); if (*start == *end) { return BUFFER_IS_FILEEND; } return BUFFER_MAY_CONTAIN_NEWLINE; } static int fb_del(stream *strm) { python_chunks_from_file *fb = (python_chunks_from_file *)strm; Py_XDECREF(fb->file); Py_XDECREF(fb->read); Py_XDECREF(fb->chunksize); Py_XDECREF(fb->chunk); PyMem_FREE(strm); return 0; } NPY_NO_EXPORT stream * stream_python_file(PyObject *obj, const char *encoding) { python_chunks_from_file *fb; fb = (python_chunks_from_file *)PyMem_Calloc(1, sizeof(python_chunks_from_file)); if (fb == NULL) { PyErr_NoMemory(); return NULL; } fb->stream.stream_nextbuf = (void *)&fb_nextbuf; fb->stream.stream_close = &fb_del; fb->encoding = encoding; Py_INCREF(obj); fb->file = obj; fb->read = PyObject_GetAttrString(obj, "read"); if (fb->read == NULL) { goto fail; } fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE); if (fb->chunksize == NULL) { goto fail; } return (stream *)fb; fail: fb_del((stream *)fb); return NULL; } /* * Stream from a Python iterable by interpreting each item as a line in a file */ typedef struct { stream stream; /* The Python file object being read. */ PyObject *iterator; /* Python str object holding the line most recently fetched */ PyObject *line; /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ const char *encoding; } python_lines_from_iterator; static int it_del(stream *strm) { python_lines_from_iterator *it = (python_lines_from_iterator *)strm; Py_XDECREF(it->iterator); Py_XDECREF(it->line); PyMem_FREE(strm); return 0; } static int it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind) { Py_XDECREF(it->line); it->line = NULL; PyObject *line = PyIter_Next(it->iterator); if (line == NULL) { if (PyErr_Occurred()) { return -1; } *start = NULL; *end = NULL; return BUFFER_IS_FILEEND; } it->line = process_stringlike(line, it->encoding); if (it->line == NULL) { return -1; } buffer_info_from_unicode(it->line, start, end, kind); return BUFFER_IS_LINEND; } NPY_NO_EXPORT stream * stream_python_iterable(PyObject *obj, const char *encoding) { python_lines_from_iterator *it; if (!PyIter_Check(obj)) { PyErr_SetString(PyExc_TypeError, "error reading from object, expected an iterable."); return NULL; } it = (python_lines_from_iterator *)PyMem_Calloc(1, sizeof(*it)); if (it == NULL) { PyErr_NoMemory(); return NULL; } it->stream.stream_nextbuf = (void *)&it_nextbuf; it->stream.stream_close = &it_del; it->encoding = encoding; Py_INCREF(obj); it->iterator = obj; return (stream *)it; }