From b1cad17a144498294ea66614bd3f7286469bf717 Mon Sep 17 00:00:00 2001 From: Mike Fleetwood Date: Sat, 20 Feb 2021 23:30:41 +0000 Subject: [PATCH] Refactor ::OnReadable() creating get_utf8_char_validated() (#136) Extract call to GLib's g_utf8_get_char_validated() and the associated workaround to also read NUL characters into a separate function to make PipeCapture::OnReadable() a little smaller and simpler, and so easier to understand. Add max_len > 0 clause into get_utf8_char_validated() like this: if (uc == UTF8_PARTIAL && max_len > 0) so that the NUL character reading workaround is only applied when max_len specifies the maximum number of bytes to read, rather than when -1 specifies reading a NUL terminated string. This makes get_utf8_char_validated() a complete wrapper of g_utf8_get_char_validated() [1], even though GParted always specifies the maximum number of bytes to read. No longer describe the inability to read NUL characters as a bug [2] since the GLib authors said it wasn't [3]. [1] GLib Reference Manual, Unicode Manipulation Functions, g_utf8_get_char_validated () https://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-utf8-get-char-validated [2] 8dbbb47ce2db0ee733ff909c1ead2f4de9475596 Workaround g_utf8_get_char_validate() bug with embedded NUL bytes (#777973) [3] Bug 780095 - g_utf8_get_char_validated() stopping at nul byte even for length specified buffers https://bugzilla.gnome.org/show_bug.cgi?id=780095#18 "If g_utf8_get_char_validated() encounters a nul byte in the middle of a string of given longer length, it returns -2, indicating a partial gunichar. That is not the obvious behaviour, but since g_utf8_get_char_validated() has been API for a long time, the behaviour cannot be changed. 
" Closes #136 - 1.2.0: test suite is failing in test_PipeCapture --- include/PipeCapture.h | 1 + src/PipeCapture.cc | 44 ++++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/include/PipeCapture.h b/include/PipeCapture.h index 95fc763a..1be1c83f 100644 --- a/include/PipeCapture.h +++ b/include/PipeCapture.h @@ -45,6 +45,7 @@ private: gpointer data ); static void append_unichar_vector_to_utf8( std::string & str, const std::vector & ucvec ); + static gunichar get_utf8_char_validated(const char *p, gssize max_len); static int utf8_char_length( unsigned char firstbyte ); Glib::RefPtr channel; // Wrapper around fd diff --git a/src/PipeCapture.cc b/src/PipeCapture.cc index 52e45cc1..d5b6f4f2 100644 --- a/src/PipeCapture.cc +++ b/src/PipeCapture.cc @@ -30,6 +30,11 @@ namespace GParted { const size_t READBUF_SIZE = 64*KIBIBYTE; + +const gunichar UTF8_PARTIAL = (gunichar)-2; +const gunichar UTF8_INVALID = (gunichar)-1; + + PipeCapture::PipeCapture( int fd, Glib::ustring &buffer ) : fill_offset( 0 ), cursor( 0 ), line_start( 0 ), @@ -125,21 +130,7 @@ bool PipeCapture::OnReadable( Glib::IOCondition condition ) fill_offset = 0; while ( read_ptr < end_ptr ) { - const gunichar UTF8_PARTIAL = (gunichar)-2; - const gunichar UTF8_INVALID = (gunichar)-1; - gunichar uc = g_utf8_get_char_validated( read_ptr, end_ptr - read_ptr ); - if ( uc == UTF8_PARTIAL ) - { - // Workaround bug in g_utf8_get_char_validated() in which - // it reports an partial UTF-8 char when a NUL byte is - // encountered in the middle of a multi-byte character, - // yet there are more bytes available in the length - // specified buffer. Report as invalid character instead. - int len = utf8_char_length( *read_ptr ); - if ( len == -1 || read_ptr + len <= end_ptr ) - uc = UTF8_INVALID; - } - + gunichar uc = get_utf8_char_validated(read_ptr, end_ptr - read_ptr); if ( uc == UTF8_PARTIAL ) { // Partial UTF-8 character at end of read buffer. 
Copy to @@ -258,6 +249,29 @@ void PipeCapture::append_unichar_vector_to_utf8( std::string & str, const std::v } } + +// GLib's g_utf8_get_char_validated() always considers strings as being NUL terminated, +// even when max_len is specified, hence can't read NUL characters. This wrapper can read +// NUL characters when max_len is specified. +// Reference: +// https://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-utf8-get-char-validated +gunichar PipeCapture::get_utf8_char_validated(const char *p, gssize max_len) +{ + gunichar uc = g_utf8_get_char_validated(p, max_len); + if (uc == UTF8_PARTIAL && max_len > 0) + { + // If g_utf8_get_char_validated() found a NUL byte in the middle of a + // multi-byte character, even when there are more bytes available as + // specified by max_len, it reports a partial UTF-8 character. Report + // this case as an invalid character instead. + int len = utf8_char_length(*p); + if (len == -1 || (gssize)len <= max_len) + uc = UTF8_INVALID; + } + return uc; +} + + int PipeCapture::utf8_char_length( unsigned char firstbyte ) { // Recognise the size of FSS-UTF (1992) / UTF-8 (1993) characters given the first