Improve the performance of PipeCapture for large output (#777973)
A user had a very corrupted FAT file system such that fsck.fat produced 122 MiB of output. GParted has to read this output to get the file system usage information. However GParted takes more than 48 hours to read the 122 MiB of output, while using 100% CPU time and is unresponsive for the duration. Modified fsck.fat to output just the first 1 MiB of output and used perf to capture performance data from GParted reading that output: # perf -g -F 1999 -- ./gpartedbin # perf report --stdio 67.84% Glib::ustring::replace [4.23s] 17.67% g_utf8_pointer_to_offset [1.10s] 8.48% g_utf8_offset_to_pointer [0.53s] [ 6.01% (everything else) ] [0.38s] [100.00% TOTAL ] [6.24s] And to read the first 10 MiB of output the performance figures are: 92.95% Glib::ustring::replace [257.44s] 4.35% g_utf8_pointer_to_offset [ 12.05s] 2.13% g_utf8_offset_to_pointer [ 5.90s] [ 0.58% (everything else) ] [ 1.61s] [100.00% TOTAL ] [277.00s] See how the total time is increasing non-linearly, 44 times longer for only 10 times as much data. This is because of the exponential increase in time spent in Glib::ustring::replace. After a lot of experimentation I came to the conclusion that Glib::ustrings are not appropriate for storing and editing large buffers of data, sizes megabytes and above. The issues are that iterators are invalid after the content changes and replacing UTF-8 characters by index gets exponentially slower as the size of the string increases. Hence the > 48 hours of 100% CPU time to read and apply the line discipline to the 122 MiB of fsck.fat output. See code comment for a more detailed description of the issues found. Rewrote OnReadable() to use Glib::ustrings as little as possible. Instead using buffers and vectors of fixed width data types allowing for fast access using pointers and indexes (converted to pointers by the compiler with simple arithmetic). Again see code comment for a more detailed description of the implementation. Repeating the performance capture with the new code for the first 1 MiB of fsck.fat output: 63.34% memcpy [0.35s] [ 36.66% (everything else) ] [0.21s] [100.00% TOTAL ] [0.56s] And for the first 10 MiB of fsck.fat output: 96.66% memcpy [63.60s] [ 3.34% (everything else) ] [ 2.20s] [100.00% TOTAL ] [65.80s] Simple timings taken to read portions of the fsck.fat output (when not using perf): 1 MiB 10 MiB 122 MiB old code : 6.2 sec 277 sec > 48 hours (4:37) new code : 0.6 sec 66 sec 17262 sec (1:06) (4:47:42) Performance of the code is still non-linear because of the assignment of the ever growing capturebuf to callerbuf for every block of input read. This is required to generate a consistent Glib::ustring copy of the input for the update callback. However this is much faster than before, and I have a plan for further improvements. Bug 777973 - Segmentation fault on bad disk
This commit is contained in:
parent
03c2be9b90
commit
b5b3d199f8
|
@ -17,8 +17,12 @@
|
||||||
#ifndef GPARTED_PIPECAPTURE_H
|
#ifndef GPARTED_PIPECAPTURE_H
|
||||||
#define GPARTED_PIPECAPTURE_H
|
#define GPARTED_PIPECAPTURE_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <stddef.h> // typedef size_t
|
||||||
|
#include <glib.h> // typedef gunichar
|
||||||
#include <glibmm/ustring.h>
|
#include <glibmm/ustring.h>
|
||||||
#include <glibmm/main.h>
|
#include <glibmm/refptr.h>
|
||||||
#include <glibmm/iochannel.h>
|
#include <glibmm/iochannel.h>
|
||||||
|
|
||||||
namespace GParted {
|
namespace GParted {
|
||||||
|
@ -26,21 +30,30 @@ namespace GParted {
|
||||||
// captures output pipe of subprocess into a ustring and emits a signal on eof
|
// captures output pipe of subprocess into a ustring and emits a signal on eof
|
||||||
class PipeCapture
|
class PipeCapture
|
||||||
{
|
{
|
||||||
Glib::ustring &buff;
|
public:
|
||||||
Glib::ustring::size_type linestart ;
|
PipeCapture( int fd, Glib::ustring &buffer );
|
||||||
Glib::ustring::size_type cursor ;
|
~PipeCapture();
|
||||||
Glib::ustring::size_type lineend ;
|
|
||||||
Glib::RefPtr<Glib::IOChannel> channel;
|
void connect_signal();
|
||||||
|
sigc::signal<void> signal_eof;
|
||||||
|
sigc::signal<void> signal_update;
|
||||||
|
|
||||||
|
private:
|
||||||
bool OnReadable( Glib::IOCondition condition );
|
bool OnReadable( Glib::IOCondition condition );
|
||||||
static gboolean _OnReadable( GIOChannel *source,
|
static gboolean _OnReadable( GIOChannel *source,
|
||||||
GIOCondition condition,
|
GIOCondition condition,
|
||||||
gpointer data );
|
gpointer data );
|
||||||
public:
|
static void append_unichar_vector_to_utf8( std::string & str,
|
||||||
PipeCapture( int fd, Glib::ustring &buffer );
|
const std::vector<gunichar> & ucvec );
|
||||||
void connect_signal();
|
|
||||||
~PipeCapture();
|
Glib::RefPtr<Glib::IOChannel> channel; // Wrapper around fd
|
||||||
sigc::signal<void> signal_eof;
|
char * readbuf; // Bytes read from IOChannel (fd)
|
||||||
sigc::signal<void> signal_update;
|
size_t fill_offset; // Filling offset into readbuf
|
||||||
|
std::vector<gunichar> linevec; // Current line stored as UCS-4 characters
|
||||||
|
size_t cursor; // Cursor position index into linevec
|
||||||
|
std::string capturebuf; // Captured output as UTF-8 characters
|
||||||
|
size_t line_start; // Index into bytebuf where current line starts
|
||||||
|
Glib::ustring & callerbuf; // Reference to caller supplied buffer
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namepace GParted
|
} // namepace GParted
|
||||||
|
|
|
@ -17,15 +17,25 @@
|
||||||
#include "PipeCapture.h"
|
#include "PipeCapture.h"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <glib.h> // typedef gunichar
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <glib.h>
|
||||||
#include <glibmm/ustring.h>
|
#include <glibmm/ustring.h>
|
||||||
|
#include <glibmm/iochannel.h>
|
||||||
|
|
||||||
namespace GParted {
|
namespace GParted {
|
||||||
|
|
||||||
PipeCapture::PipeCapture( int fd, Glib::ustring &string ) : buff( string ),
|
const size_t READBUF_SIZE = 512;
|
||||||
linestart( 0 ), cursor( 0 ), lineend( 0 )
|
|
||||||
|
PipeCapture::PipeCapture( int fd, Glib::ustring &buffer ) : fill_offset( 0 ),
|
||||||
|
cursor( 0 ),
|
||||||
|
line_start( 0 ),
|
||||||
|
callerbuf( buffer )
|
||||||
{
|
{
|
||||||
buff.clear();
|
readbuf = new char[READBUF_SIZE];
|
||||||
|
callerbuf.clear();
|
||||||
// tie fd to string
|
// tie fd to string
|
||||||
// make channel
|
// make channel
|
||||||
channel = Glib::IOChannel::create_from_fd( fd );
|
channel = Glib::IOChannel::create_from_fd( fd );
|
||||||
|
@ -53,60 +63,171 @@ gboolean PipeCapture::_OnReadable( GIOChannel *source,
|
||||||
|
|
||||||
bool PipeCapture::OnReadable( Glib::IOCondition condition )
|
bool PipeCapture::OnReadable( Glib::IOCondition condition )
|
||||||
{
|
{
|
||||||
//Read from pipe and store in buff. Provide minimal interpretation so
|
// Reads UTF-8 characters from channel. Provides minimal interpretation so
|
||||||
// programs which use text progress bars are displayed correctly.
|
// programs which use text progress bars are displayed correctly. Captures the
|
||||||
// Linestart, cursor and lineend are offsets into buff like this:
|
// output in a buffer and runs callbacks when updated or EOF reached.
|
||||||
// "Previous line\n
|
//
|
||||||
// Current line. Text progress bar: XXXXXXXXXX----------"
|
// Data model:
|
||||||
// ^ ^ ^
|
//
|
||||||
// linestart cursor lineend
|
// fill_offset
|
||||||
Glib::ustring str;
|
// v
|
||||||
Glib::IOStatus status = channel->read( str, 512 );
|
// readbuf "XXXX......................"
|
||||||
|
// ^ ^
|
||||||
|
// | end_ptr
|
||||||
|
// read_ptr
|
||||||
|
//
|
||||||
|
// linevec "Current line. Text progress bar: XXXXXXXX--------"
|
||||||
|
// ^
|
||||||
|
// cursor
|
||||||
|
//
|
||||||
|
// capturebuf "First line\n
|
||||||
|
// Current line. Text progress bar: XXXX-----------"
|
||||||
|
// ^
|
||||||
|
// line_start
|
||||||
|
//
|
||||||
|
// Processing details:
|
||||||
|
// Bytes are read into readbuf. Valid UTF-8 character byte sequences are
|
||||||
|
// recognised and, applying a simple line discipline, added into the vector of
|
||||||
|
// characters storing the current line, linevec. (Linevec uses UCS-4 encoding for
|
||||||
|
// fixed sized values accessible in constant time via pointer arithmetic). When
|
||||||
|
// a new line character is encountered the complete current line, or when readbuf
|
||||||
|
// is drained the partial current line, is pasted into capturebuf at the offset
|
||||||
|
// where the last line starts. (Capturebuf stores UTF-8 encoded characters in a
|
||||||
|
// std::string for constant time access to line_start offset). When readbuf
|
||||||
|
// is drained capturebuf is copied into callerbuf and signal_update slot fired.
|
||||||
|
// (Callerbuf stores UTF-8 encoded characters in a Glib::ustring). When EOF is
|
||||||
|
// encountered capturebuf is copied into callerbuf if required and signal_eof slot
|
||||||
|
// fired.
|
||||||
|
//
|
||||||
|
// Golden rule:
|
||||||
|
// Use Glib::ustrings as little as possible for large amounts of data!
|
||||||
|
// 1) Glib::ustring::iterators use pointer access under the hood and are fast, but
|
||||||
|
// 1.1) the Glib::ustring must only contain valid UTF-8 bytes otherwise
|
||||||
|
// operator++(), operator--() and operator*() may read past the end of the
|
||||||
|
// string until a segfault occurs; and
|
||||||
|
// 1.2) become invalid leaving them pointing at the old memory after the
|
||||||
|
// underlying storage is reallocated to accommodate storing extra
|
||||||
|
// characters.
|
||||||
|
// 2) Indexed character access into Glib::ustrings reads all the variable width
|
||||||
|
// UTF-8 encoded characters from the start of the string until the particular
|
||||||
|
// indexed character is reached. Replacing characters gets exponentially
|
||||||
|
// slower as the string gets longer and all characters beyond those replaced
|
||||||
|
// have to be moved in memory.
|
||||||
|
|
||||||
|
gsize bytes_read;
|
||||||
|
Glib::IOStatus status = channel->read( readbuf + fill_offset, READBUF_SIZE - fill_offset, bytes_read );
|
||||||
if ( status == Glib::IO_STATUS_NORMAL )
|
if ( status == Glib::IO_STATUS_NORMAL )
|
||||||
{
|
{
|
||||||
for ( unsigned int i = 0 ; i < str.size() ; i ++ )
|
const char * read_ptr = readbuf;
|
||||||
|
const char * end_ptr = readbuf + fill_offset + bytes_read;
|
||||||
|
fill_offset = 0;
|
||||||
|
while ( read_ptr < end_ptr )
|
||||||
{
|
{
|
||||||
gunichar uc = str[i];
|
const gunichar UTF8_PARTIAL = (gunichar)-2;
|
||||||
if ( uc == '\b' ) {
|
const gunichar UTF8_INVALID = (gunichar)-1;
|
||||||
if ( cursor > linestart )
|
gunichar uc = g_utf8_get_char_validated( read_ptr, end_ptr - read_ptr );
|
||||||
|
if ( uc == UTF8_PARTIAL )
|
||||||
|
{
|
||||||
|
// Partial UTF-8 character at end of read buffer. Copy to
|
||||||
|
// start of read buffer.
|
||||||
|
size_t bytes_remaining = end_ptr - read_ptr;
|
||||||
|
memcpy( readbuf, read_ptr, bytes_remaining );
|
||||||
|
fill_offset = bytes_remaining;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if ( uc == UTF8_INVALID )
|
||||||
|
{
|
||||||
|
// Skip invalid byte.
|
||||||
|
read_ptr ++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Advance read pointer past the read UTF-8 character.
|
||||||
|
read_ptr = g_utf8_find_next_char( read_ptr, end_ptr );
|
||||||
|
if ( read_ptr == NULL )
|
||||||
|
read_ptr = end_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( uc == '\b' )
|
||||||
|
{
|
||||||
|
if ( cursor > 0 )
|
||||||
cursor --;
|
cursor --;
|
||||||
}
|
}
|
||||||
else if ( uc == '\r' )
|
else if ( uc == '\r' )
|
||||||
cursor = linestart ;
|
{
|
||||||
else if ( uc == '\n' ) {
|
cursor = 0;
|
||||||
cursor = lineend ;
|
}
|
||||||
buff .append( 1, '\n' ) ;
|
else if ( uc == '\n' )
|
||||||
|
{
|
||||||
|
// Append char to current line; paste current line to
|
||||||
|
// capture buffer; reset current line.
|
||||||
|
linevec.push_back( '\n' );
|
||||||
cursor ++;
|
cursor ++;
|
||||||
linestart = cursor ;
|
|
||||||
lineend = cursor ;
|
capturebuf.resize( line_start );
|
||||||
|
append_unichar_vector_to_utf8( capturebuf, linevec );
|
||||||
|
line_start = capturebuf.size();
|
||||||
|
|
||||||
|
linevec.clear();
|
||||||
|
cursor = 0;
|
||||||
}
|
}
|
||||||
else if ( uc == '\x01' || uc == '\x02' )
|
else if ( uc == '\x01' || uc == '\x02' )
|
||||||
|
{
|
||||||
// Skip Ctrl-A and Ctrl-B chars e2fsck uses to bracket the progress bar
|
// Skip Ctrl-A and Ctrl-B chars e2fsck uses to bracket the progress bar
|
||||||
continue;
|
continue;
|
||||||
else {
|
}
|
||||||
if ( cursor < lineend ) {
|
else
|
||||||
buff.replace( cursor, 1, 1, uc );
|
{
|
||||||
|
if ( cursor < linevec.size() )
|
||||||
|
{
|
||||||
|
// Replace char in current line.
|
||||||
|
linevec[cursor] = uc;
|
||||||
cursor ++;
|
cursor ++;
|
||||||
}
|
}
|
||||||
else {
|
else
|
||||||
buff.append( 1, uc );
|
{
|
||||||
|
// Append char to current line.
|
||||||
|
linevec.push_back( uc );
|
||||||
cursor ++;
|
cursor ++;
|
||||||
lineend = cursor ;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Paste partial line to capture buffer; copy that to callers buffer; and
|
||||||
|
// fire any update callbacks.
|
||||||
|
capturebuf.resize( line_start );
|
||||||
|
append_unichar_vector_to_utf8( capturebuf, linevec );
|
||||||
|
|
||||||
|
callerbuf = capturebuf;
|
||||||
signal_update.emit();
|
signal_update.emit();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( status != Glib::IO_STATUS_EOF )
|
if ( status != Glib::IO_STATUS_EOF )
|
||||||
|
{
|
||||||
std::cerr << "Pipe IOChannel read failed" << std::endl;
|
std::cerr << "Pipe IOChannel read failed" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
// signal completion
|
// signal completion
|
||||||
signal_eof.emit();
|
signal_eof.emit();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PipeCapture::append_unichar_vector_to_utf8( std::string & str, const std::vector<gunichar> & ucvec )
|
||||||
|
{
|
||||||
|
const size_t MAX_UTF8_BYTES = 6;
|
||||||
|
char buf[MAX_UTF8_BYTES];
|
||||||
|
for ( unsigned int i = 0 ; i < ucvec.size() ; i ++ )
|
||||||
|
{
|
||||||
|
int bytes_written = g_unichar_to_utf8( ucvec[i], buf );
|
||||||
|
str.append( buf, bytes_written );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
PipeCapture::~PipeCapture()
|
PipeCapture::~PipeCapture()
|
||||||
{
|
{
|
||||||
|
delete[] readbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace GParted
|
} // namespace GParted
|
||||||
|
|
Loading…
Reference in New Issue