gparted/src/PipeCapture.cc

310 lines
9.9 KiB
C++

/* Copyright (C) 2013 Phillip Susi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "PipeCapture.h"
#include "Utils.h"
#include <iostream>
#include <string>
#include <vector>
#include <stddef.h>
#include <string.h>
#include <glib.h>
#include <glibmm/ustring.h>
#include <glibmm/iochannel.h>
namespace GParted {
// Capacity in bytes of the raw read buffer allocated by the PipeCapture constructor
// and filled from the IOChannel in OnReadable().
const size_t READBUF_SIZE = KIBIBYTE * 64;

// Sentinel values compared against the result of g_utf8_get_char_validated() (via
// the get_utf8_char_validated() wrapper below):
//   UTF8_PARTIAL - the available bytes end part way through a multi-byte character.
//   UTF8_INVALID - the bytes do not form a valid UTF-8 character.
const gunichar UTF8_PARTIAL = static_cast<gunichar>(-2);
const gunichar UTF8_INVALID = static_cast<gunichar>(-1);
// Construct a capture object reading from file descriptor fd and publishing the
// captured text into the caller supplied buffer.
//
// Parameters:
//   fd     - file descriptor to read from; wrapped in a Glib::IOChannel.
//   buffer - caller's string which receives the captured output.  It is cleared
//            here and written to again from OnReadable().
//            NOTE(review): callerbuf is presumably a reference member (declared in
//            the header), in which case buffer must outlive this object - confirm.
//
// No watch is installed here; call connect_signal() to start receiving data.
PipeCapture::PipeCapture( int fd, Glib::ustring &buffer ) : fill_offset( 0 ),
                                                            cursor( 0 ),
                                                            line_start( 0 ),
                                                            callerbuf( buffer )
{
	// Start with an empty caller buffer which is, trivially, in sync with the
	// (also empty) capture buffer.
	callerbuf.clear();
	callerbuf_uptodate = true;

	// Wrap the file descriptor in an IOChannel.  An empty encoding is requested
	// because OnReadable() performs its own UTF-8 validation of the bytes read.
	// (Presumably this selects raw, unconverted reads - see the glibmm
	// IOChannel::set_encoding() documentation.)
	channel = Glib::IOChannel::create_from_fd( fd );
	channel->set_encoding("");

	// Allocate the raw read buffer last.  If any of the calls above throws, the
	// destructor is not run for the partially constructed object, so a buffer
	// allocated earlier would be leaked.  Allocating last means there is nothing
	// to leak on those paths, and if this allocation itself throws the already
	// constructed members clean themselves up.
	readbuf = new char[READBUF_SIZE];
}
void PipeCapture::connect_signal()
{
// connect handler to signal input/output
g_io_add_watch( channel->gobj(),
GIOCondition(G_IO_IN | G_IO_ERR | G_IO_HUP),
_OnReadable,
this );
}
// C callback trampoline registered by connect_signal().  Recovers the PipeCapture
// object from the user data pointer and forwards the condition to the OnReadable()
// member function, passing its result straight back to GLib.
//
// Parameters:
//   source    - channel which triggered the callback (unused).
//   condition - condition(s) which became true on the channel.
//   data      - the PipeCapture object passed as user data to g_io_add_watch().
gboolean PipeCapture::_OnReadable( GIOChannel *source,
                                   GIOCondition condition,
                                   gpointer data )
{
	PipeCapture * const self = static_cast<PipeCapture *>(data);
	return self->OnReadable( Glib::IOCondition(condition) );
}
// Handle activity on the channel: read the available bytes, interpret them and
// update the capture buffers.
//
// Parameters:
//   condition - condition(s) reported by the watch.  Not examined; the outcome of
//               the read decides what happens.
//
// Returns:
//   true  - data was read and processed; returned to GLib via _OnReadable() to keep
//           the watch installed.
//   false - EOF or read failure; signal_eof has been emitted.  Returned to GLib via
//           _OnReadable(), which per the GIOFunc contract removes the watch.
bool PipeCapture::OnReadable( Glib::IOCondition condition )
{
	// Reads UTF-8 characters from channel.  Provides minimal interpretation so
	// programs which use text progress bars are displayed correctly.  Captures the
	// output in a buffer and runs callbacks when updated or EOF reached.
	//
	// Data model:
	//
	//     readbuf    "XXXX......................"
	//                 ^   ^
	//                 |   end_ptr
	//                 read_ptr
	//
	//     linevec    "Current line.  Text progress bar: XXXXXXXX--------"
	//                                                           ^
	//                                                           cursor
	//
	//     capturebuf "First line\n
	//                 Current line.  Text progress bar: XXXX-----------"
	//                 ^
	//                 line_start
	//
	//     fill_offset - number of bytes of an incomplete UTF-8 character carried
	//                   over at the start of readbuf from the previous call; the
	//                   next read is stored immediately after them.
	//     read_ptr    - next unprocessed byte in readbuf.
	//     end_ptr     - one past the last byte available in readbuf.
	//     cursor      - index into linevec where the next character is written.
	//     line_start  - byte offset into capturebuf where the current line starts.
	//
	// Processing details:
	// Bytes are read into readbuf.  Valid UTF-8 character byte sequences are
	// recognised and, applying a simple line discipline, added into the vector of
	// characters storing the current line, linevec.  (Linevec uses UCS-4 encoding for
	// fixed sized values accessible in constant time via pointer arithmetic).  When
	// a new line character is encountered the complete current line, or when readbuf
	// is drained the partial current line, is pasted into capturebuf at the offset
	// where the last line starts.  (Capturebuf stores UTF-8 encoded characters in a
	// std::string for constant time access to line_start offset).  When readbuf
	// is drained and there are registered update callbacks, capturebuf is copied into
	// callerbuf and signal_update slot fired.  (Callerbuf stores UTF-8 encoded
	// characters in a Glib::ustring).  When EOF is encountered capturebuf is copied
	// into callerbuf if required and signal_eof slot fired.
	//
	// Golden rule:
	// Use Glib::ustrings as little as possible for large amounts of data!
	// 1) Glib::ustring::iterators use pointer access under the hood and are fast, but
	//    1.1) the Glib::ustring must only contain valid UTF-8 bytes otherwise
	//         operator++(), operator--() and operator*() may read past the end of the
	//         string until a segfault occurs; and
	//    1.2) become invalid leaving them pointing at the old memory after the
	//         underlying storage is reallocated to accommodate storing extra
	//         characters.
	// 2) Indexed character access into Glib::ustrings reads all the variable width
	//    UTF-8 encoded characters from the start of the string until the particular
	//    indexed character is reached.  Replacing characters gets progressively
	//    slower as the string gets longer and all characters beyond those replaced
	//    have to be moved in memory.
	gsize bytes_read = 0;
	Glib::IOStatus status = channel->read( readbuf + fill_offset, READBUF_SIZE - fill_offset, bytes_read );
	if ( status == Glib::IO_STATUS_NORMAL )
	{
		const char * read_ptr = readbuf;
		const char * end_ptr = readbuf + fill_offset + bytes_read;
		fill_offset = 0;
		while ( read_ptr < end_ptr )
		{
			gunichar uc = get_utf8_char_validated(read_ptr, end_ptr - read_ptr);
			if ( uc == UTF8_PARTIAL )
			{
				// Partial UTF-8 character at end of read buffer.  Move to
				// start of read buffer so the next read completes it.
				// Source and destination are both inside readbuf and
				// overlap when little data was read (e.g. one ASCII byte
				// followed by two bytes of a three byte character), so
				// memmove() must be used; memcpy() is undefined for
				// overlapping ranges.
				size_t bytes_remaining = end_ptr - read_ptr;
				memmove( readbuf, read_ptr, bytes_remaining );
				fill_offset = bytes_remaining;
				break;
			}
			else if ( uc == UTF8_INVALID )
			{
				// Skip invalid byte.
				read_ptr ++;
				continue;
			}
			else
			{
				// Advance read pointer past the read UTF-8 character.
				const char * new_ptr = g_utf8_find_next_char( read_ptr, end_ptr );
				if ( new_ptr == read_ptr && *read_ptr == '\0' )
					// Workaround bug in g_utf8_find_next_char() which
					// stops it advancing past NUL char in buffer
					// delimited by an end pointer.
					new_ptr ++;
				read_ptr = new_ptr;
				// A null result means the end of the buffer was reached.
				if (read_ptr == nullptr)
					read_ptr = end_ptr;
			}

			if ( uc == '\b' )
			{
				// Backspace: move the cursor left, stopping at the start
				// of the line.  The character is not erased.
				if ( cursor > 0 )
					cursor --;
			}
			else if ( uc == '\r' )
			{
				// Carriage return: following characters overwrite the
				// current line from its start.
				cursor = 0;
			}
			else if ( uc == '\n' )
			{
				// Append char to current line; paste current line to
				// capture buffer; reset current line.
				linevec.push_back( '\n' );
				cursor ++;

				capturebuf.resize( line_start );
				append_unichar_vector_to_utf8( capturebuf, linevec );
				line_start = capturebuf.size();
				callerbuf_uptodate = false;

				linevec.clear();
				cursor = 0;
			}
			else if ( uc == '\x01' || uc == '\x02' )
			{
				// Skip Ctrl-A and Ctrl-B chars e2fsck uses to bracket the progress bar
				continue;
			}
			else
			{
				if ( cursor < linevec.size() )
				{
					// Replace char in current line.
					linevec[cursor] = uc;
					cursor ++;
				}
				else
				{
					// Append char to current line.
					linevec.push_back( uc );
					cursor ++;
				}
			}
		}

		// Paste partial line to capture buffer.
		capturebuf.resize( line_start );
		append_unichar_vector_to_utf8( capturebuf, linevec );
		callerbuf_uptodate = false;

		if ( ! signal_update.empty() )
		{
			// Performance optimisation, especially for large capture buffers:
			// only copy capture buffer to callers buffer and fire update
			// callbacks when there are any registered update callbacks.
			callerbuf = capturebuf;
			callerbuf_uptodate = true;
			signal_update.emit();
		}
		return true;
	}

	// Any status other than normal or EOF is reported, then handled like EOF.
	if ( status != Glib::IO_STATUS_EOF )
	{
		std::cerr << "Pipe IOChannel read failed" << std::endl;
	}

	// Make sure the caller sees everything captured before announcing completion.
	if ( ! callerbuf_uptodate )
	{
		callerbuf = capturebuf;
		callerbuf_uptodate = true;
	}

	// signal completion
	signal_eof.emit();
	return false;
}
void PipeCapture::append_unichar_vector_to_utf8( std::string & str, const std::vector<gunichar> & ucvec )
{
const size_t MAX_UTF8_BYTES = 6;
char buf[MAX_UTF8_BYTES];
for ( unsigned int i = 0 ; i < ucvec.size() ; i ++ )
{
int bytes_written = g_unichar_to_utf8( ucvec[i], buf );
str.append( buf, bytes_written );
}
}
// Wrapper around GLib's g_utf8_get_char_validated() which can read NUL characters.
// GLib's function always treats the string as NUL terminated, even when max_len is
// specified, so on its own it can't return a NUL character from a length delimited
// buffer.
//
// Parameters:
//   p       - first byte of the character to decode.
//   max_len - number of bytes available at p.
//
// Returns the decoded character, or UTF8_PARTIAL when the available bytes end part
// way through a character, or UTF8_INVALID when the bytes are not valid UTF-8.
//
// Reference:
//     https://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-utf8-get-char-validated
gunichar PipeCapture::get_utf8_char_validated(const char *p, gssize max_len)
{
	const gunichar decoded = g_utf8_get_char_validated(p, max_len);

	// Only a "partial" result for a length delimited buffer needs a second look.
	if (decoded != UTF8_PARTIAL || max_len <= 0)
		return decoded;

	// Report NUL character as such.
	if (*p == '\0')
		return '\0';

	// GLib also reports "partial" when it meets a NUL byte in the middle of a
	// multi-byte character, even though max_len says more bytes are available.
	// The result is only genuinely partial when the character announced by the
	// first byte is longer than the bytes on hand; anything else is invalid.
	const int expected_len = utf8_char_length(*p);
	const bool needs_more_bytes = expected_len != -1 && static_cast<gssize>(expected_len) > max_len;
	return needs_more_bytes ? decoded : UTF8_INVALID;
}
// Return the total length in bytes of the UTF-8 character introduced by firstbyte,
// or -1 when firstbyte cannot start a character.
//
// Sizes follow FSS-UTF (1992) / UTF-8 (1993), in which characters can be up to 6
// bytes.  (Later UTF-8 (2003) limited characters to 4 bytes and 21-bits of Unicode
// code-space).
// Reference:
//     https://en.wikipedia.org/wiki/UTF-8
int PipeCapture::utf8_char_length( unsigned char firstbyte )
{
	// The length is encoded as the number of 1 bits preceding the first 0 bit,
	// counted from the most significant bit.
	int leading_ones = 0;
	for ( unsigned char bit = 0x80 ; bit != 0 && ( firstbyte & bit ) != 0 ; bit >>= 1 )
		leading_ones ++;

	switch ( leading_ones )
	{
		case 0:   // 0xxxxxxx - 1 byte UTF-8 char
			return 1;
		case 2:   // 110xxxxx - First byte of a 2 byte UTF-8 char
		case 3:   // 1110xxxx - First byte of a 3 byte UTF-8 char
		case 4:   // 11110xxx - First byte of a 4 byte UTF-8 char
		case 5:   // 111110xx - First byte of a 5 byte UTF-8 char
		case 6:   // 1111110x - First byte of a 6 byte UTF-8 char
			return leading_ones;
		default:  // 10xxxxxx - Continuation byte; 1111111x - Invalid byte
			return -1;
	}
}
// Release the raw read buffer allocated by the constructor.
// NOTE(review): the watch installed by connect_signal() is not removed here (its
// source id is never stored), so destroying the object while the watch is still
// active would presumably leave GLib holding a dangling pointer - confirm callers
// only destroy the object after signal_eof has fired.
PipeCapture::~PipeCapture()
{
delete[] readbuf;
}
} // namespace GParted