/* Copyright (C) 2013 Phillip Susi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "PipeCapture.h"
#include "Utils.h"

#include <iostream>
#include <string>
#include <vector>
#include <stddef.h>
#include <string.h>
#include <glib.h>
#include <glibmm/ustring.h>
#include <glibmm/iochannel.h>


namespace GParted
{


// Size of the raw byte buffer each channel read is made into.
const size_t READBUF_SIZE = 64*KIBIBYTE;

// Sentinel values compared against the result of g_utf8_get_char_validated().
const gunichar UTF8_PARTIAL = static_cast<gunichar>(-2);
const gunichar UTF8_INVALID = static_cast<gunichar>(-1);


// Prepare to capture everything read from file descriptor fd into buffer.
// The caller's buffer is cleared here.  No reading takes place until
// connect_signal() has been called.
PipeCapture::PipeCapture( int fd, Glib::ustring &buffer ) : fill_offset( 0 ),
                                                            cursor( 0 ),
                                                            line_start( 0 ),
                                                            callerbuf( buffer )
{
	readbuf = new char[READBUF_SIZE];
	callerbuf.clear();
	callerbuf_uptodate = true;
	// tie fd to string
	// make channel
	channel = Glib::IOChannel::create_from_fd( fd );
	// Empty encoding name.  NOTE(review): presumably selects raw byte reads with no
	// character set conversion, as OnReadable() does its own UTF-8 validation.
	channel->set_encoding("");
}


// Start watching the channel so that _OnReadable() is called whenever input is
// available, or an error or hang up occurs.
void PipeCapture::connect_signal()
{
	// connect handler to signal input/output
	g_io_add_watch( channel->gobj(),
	                GIOCondition(G_IO_IN | G_IO_ERR | G_IO_HUP),
	                _OnReadable,
	                this );
}


// C callback trampoline registered with g_io_add_watch().  data is the PipeCapture
// object passed as user data by connect_signal().  The returned value is that of
// OnReadable().
gboolean PipeCapture::_OnReadable( GIOChannel * /*source*/,
                                   GIOCondition condition,
                                   gpointer data )
{
	PipeCapture *pc = static_cast<PipeCapture *>(data);
	gboolean rc = pc->OnReadable( Glib::IOCondition(condition) );
	return rc;
}


// Read and process one buffer full of available data from the channel.
// Returns true after a normal read.  Returns false, after emitting signal_eof, when
// EOF or a read failure is encountered.
bool PipeCapture::OnReadable( Glib::IOCondition /*condition*/ )
{
	// Reads UTF-8 characters from channel.  Provides minimal interpretation so
	// programs which use text progress bars are displayed correctly.  Captures the
	// output in a buffer and runs callbacks when updated or EOF reached.
	//
	// Data model:
	//
	//                  fill_offset
	//                  v
	//     readbuf     "XXXX......................"
	//                  ^   ^
	//                  |   end_ptr
	//                  read_ptr
	//
	//     linevec     "Current line.  Text progress bar: XXXXXXXX--------"
	//                                                           ^
	//                                                           cursor
	//
	//     capturebuf  "First line\n
	//                  Current line.  Text progress bar: XXXX-----------"
	//                  ^
	//                  line_start
	//
	// Processing details:
	// Bytes are read into readbuf.  Valid UTF-8 character byte sequences are
	// recognised and, applying a simple line discipline, added into the vector of
	// characters storing the current line, linevec.  (Linevec uses UCS-4 encoding for
	// fixed sized values accessible in constant time via pointer arithmetic).  When
	// a new line character is encountered the complete current line, or when readbuf
	// is drained the partial current line, is pasted into capturebuf at the offset
	// where the last line starts.  (Capturebuf stores UTF-8 encoded characters in a
	// std::string for constant time access to line_start offset).  When readbuf
	// is drained and there are registered update callbacks, capturebuf is copied into
	// callerbuf and signal_update slot fired.  (Callerbuf stores UTF-8 encoded
	// characters in a Glib::ustring).  When EOF is encountered capturebuf is copied
	// into callerbuf if required and signal_eof slot fired.
	//
	// Golden rule:
	// Use Glib::ustrings as little as possible for large amounts of data!
	// 1) Glib::ustring::iterators use pointer access under the hood and are fast, but
	//    1.1) the Glib::ustring must only contain valid UTF-8 bytes otherwise
	//         operator++(), operator--() and operator*() may read past the end of the
	//         string until a segfault occurs; and
	//    1.2) become invalid leaving them pointing at the old memory after the
	//         underlying storage is reallocated to accommodate storing extra
	//         characters.
	// 2) Indexed character access into Glib::ustrings reads all the variable width
	//    UTF-8 encoded characters from the start of the string until the particular
	//    indexed character is reached.  Replacing characters gets progressively
	//    slower as the string gets longer and all characters beyond those replaced
	//    have to be moved in memory.

	gsize bytes_read;
	Glib::IOStatus status = channel->read( readbuf + fill_offset,
	                                       READBUF_SIZE - fill_offset,
	                                       bytes_read );
	if ( status == Glib::IO_STATUS_NORMAL )
	{
		// Process any partial character carried over from the previous read
		// together with the newly read bytes.
		const char * read_ptr = readbuf;
		const char * end_ptr = readbuf + fill_offset + bytes_read;
		fill_offset = 0;
		while ( read_ptr < end_ptr )
		{
			const gunichar uc = get_utf8_char_validated( read_ptr, end_ptr - read_ptr );
			if ( uc == UTF8_PARTIAL )
			{
				// Partial UTF-8 character at end of read buffer.  Copy to
				// start of read buffer.  Source and destination are both
				// inside readbuf and overlap when little data was read
				// (read_ptr may even equal readbuf), so memmove() is
				// required; memcpy() on overlapping ranges is undefined.
				const size_t bytes_remaining = end_ptr - read_ptr;
				memmove( readbuf, read_ptr, bytes_remaining );
				fill_offset = bytes_remaining;
				break;
			}
			else if ( uc == UTF8_INVALID )
			{
				// Skip invalid byte.
				read_ptr ++;
				continue;
			}
			else
			{
				// Advance read pointer past the read UTF-8 character.
				const char * new_ptr = g_utf8_find_next_char( read_ptr, end_ptr );
				if ( new_ptr == read_ptr && *read_ptr == '\0' )
					// Workaround bug in g_utf8_find_next_char() which
					// stops it advancing past NUL char in buffer
					// delimited by an end pointer.
					new_ptr ++;
				read_ptr = new_ptr;
				// A null result is treated as having consumed the rest
				// of the buffer.
				if ( read_ptr == nullptr )
					read_ptr = end_ptr;
			}

			// Minimal line discipline applied to the decoded character.
			if ( uc == '\b' )
			{
				if ( cursor > 0 )
					cursor --;
			}
			else if ( uc == '\r' )
			{
				cursor = 0;
			}
			else if ( uc == '\n' )
			{
				// Append char to current line; paste current line to
				// capture buffer; reset current line.
				linevec.push_back( '\n' );
				cursor ++;

				capturebuf.resize( line_start );
				append_unichar_vector_to_utf8( capturebuf, linevec );
				line_start = capturebuf.size();
				callerbuf_uptodate = false;

				linevec.clear();
				cursor = 0;
			}
			else if ( uc == '\x01' || uc == '\x02' )
			{
				// Skip Ctrl-A and Ctrl-B chars e2fsck uses to bracket the progress bar
				continue;
			}
			else
			{
				if ( cursor < linevec.size() )
				{
					// Replace char in current line.
					linevec[cursor] = uc;
					cursor ++;
				}
				else
				{
					// Append char to current line.
					linevec.push_back( uc );
					cursor ++;
				}
			}
		}

		// Paste partial line to capture buffer.
		capturebuf.resize( line_start );
		append_unichar_vector_to_utf8( capturebuf, linevec );
		callerbuf_uptodate = false;

		if ( ! signal_update.empty() )
		{
			// Performance optimisation, especially for large capture buffers:
			// only copy capture buffer to callers buffer and fire update
			// callbacks when there are any registered update callbacks.
			callerbuf = capturebuf;
			callerbuf_uptodate = true;
			signal_update.emit();
		}
		return true;
	}

	if ( status != Glib::IO_STATUS_EOF )
	{
		std::cerr << "Pipe IOChannel read failed" << std::endl;
	}

	// Make sure the caller sees everything captured before completion is signalled.
	if ( ! callerbuf_uptodate )
	{
		callerbuf = capturebuf;
		callerbuf_uptodate = true;
	}

	// signal completion
	signal_eof.emit();
	return false;
}


// Append every character in ucvec to str, encoded as UTF-8.
void PipeCapture::append_unichar_vector_to_utf8( std::string & str, const std::vector<gunichar> & ucvec )
{
	const size_t MAX_UTF8_BYTES = 6;
	char buf[MAX_UTF8_BYTES];

	for ( const gunichar uc : ucvec )
	{
		const int bytes_written = g_unichar_to_utf8( uc, buf );
		str.append( buf, bytes_written );
	}
}


// GLib's g_utf8_get_char_validated() always considers strings as being NUL terminated,
// even when max_len is specified, hence can't read NUL characters.  This wrapper can read
// NUL characters when max_len is specified.
// Returns the decoded character, UTF8_PARTIAL or UTF8_INVALID.
// Reference:
//     https://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-utf8-get-char-validated
gunichar PipeCapture::get_utf8_char_validated(const char *p, gssize max_len)
{
	gunichar uc = g_utf8_get_char_validated(p, max_len);
	if (uc == UTF8_PARTIAL && max_len > 0)
	{
		// Report NUL character as such.
		if (*p == '\0')
			return '\0';

		// If g_utf8_get_char_validated() found a NUL byte in the middle of a
		// multi-byte character, even when there are more bytes available as
		// specified by max_len, it reports a partial UTF-8 character.  Report
		// this case as an invalid character instead.
		const int len = utf8_char_length(*p);
		if (len == -1 || static_cast<gssize>(len) <= max_len)
			uc = UTF8_INVALID;
	}
	return uc;
}


// Return the number of bytes, 1 to 6, in the UTF-8 character introduced by firstbyte,
// or -1 when firstbyte can not start a character.
int PipeCapture::utf8_char_length( unsigned char firstbyte )
{
	// Recognise the size of FSS-UTF (1992) / UTF-8 (1993) characters given the first
	// byte.  Characters can be up to 6 bytes.  (Later UTF-8 (2003) limited characters
	// to 4 bytes and 21-bits of Unicode code-space).
	// Reference:
	//     https://en.wikipedia.org/wiki/UTF-8
	if ( ( firstbyte & 0x80 ) == 0x00 )       // 0xxxxxxx - 1 byte UTF-8 char
		return 1;
	else if ( ( firstbyte & 0xE0 ) == 0xC0 )  // 110xxxxx - First byte of a 2 byte UTF-8 char
		return 2;
	else if ( ( firstbyte & 0xF0 ) == 0xE0 )  // 1110xxxx - First byte of a 3 byte UTF-8 char
		return 3;
	else if ( ( firstbyte & 0xF8 ) == 0xF0 )  // 11110xxx - First byte of a 4 byte UTF-8 char
		return 4;
	else if ( ( firstbyte & 0xFC ) == 0xF8 )  // 111110xx - First byte of a 5 byte UTF-8 char
		return 5;
	else if ( ( firstbyte & 0xFE ) == 0xFC )  // 1111110x - First byte of a 6 byte UTF-8 char
		return 6;
	else                                      // 10xxxxxx - Continuation byte, or invalid byte
		return -1;
}


// Release the read buffer allocated by the constructor.
PipeCapture::~PipeCapture()
{
	delete[] readbuf;
}


}  // namespace GParted