From 3a5dcfc0161a7b3a21be33e27674d498da895197 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Thu, 28 Apr 2022 22:03:37 -0400 Subject: [PATCH] tamer: diagnose::resolver::SourceLine: {Vec=>String} Using a byte vector just makes life more difficult with regard to preparing the diagnostic reports. We're already validating UTF-8 data for column generation, which is necessary for a robust report, so let's just store it as a String to begin with. DEV-12151 --- tamer/src/diagnose/resolver.rs | 36 +++++++++++++++-------------- tamer/src/diagnose/resolver/test.rs | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tamer/src/diagnose/resolver.rs b/tamer/src/diagnose/resolver.rs index 3666b8e8..0807fff0 100644 --- a/tamer/src/diagnose/resolver.rs +++ b/tamer/src/diagnose/resolver.rs @@ -257,8 +257,8 @@ pub struct SourceLine { /// The [`Span`] representing the entire source line. span: Span, - /// Source code text of the line _excluding_ the newline. - text: Vec, + /// Source code text of the line. + text: String, } impl SourceLine { @@ -271,7 +271,7 @@ impl SourceLine { num: NonZeroU32, column: Option, span: Span, - text: Vec, + text: String, ) -> Self { Self { num, @@ -284,14 +284,12 @@ impl SourceLine { impl Display for SourceLine { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // TODO: Just store String instead of a byte vector so we're not - // validating UTF-8 twice. - let s = String::from_utf8_lossy(&self.text); + let text = &self.text; - let disp = if self.text.last() == Some(&b'\n') { - &s[0..s.len() - 1] + let disp = if text.chars().last() == Some('\n') { + &text[0..text.len() - 1] } else { - &s[..] + &text[..] }; f.write_str(disp) @@ -571,19 +569,15 @@ impl Line { /// Produce formatted output for a line containing invalid UTF-8 data. /// - /// This still produces the actual line so that we can help the user - /// track down the invalid byte sequences, - /// but it is unable to resolve columns. - /// /// This is delegated to by [`Line::format_for`]. fn format_invalid_utf8_for( &mut self, span: Span, - ) -> (Vec, Span, Option) { + ) -> (String, Span, Option) { let bytes = self.take_buf().unwrap_or(vec![]); let span = span.context().span_or_zz(0, bytes.len()); - (bytes, span, None) + (String::from_utf8_lossy(&bytes).into(), span, None) } /// Format the line buffer and provide an associated line [`Span`] @@ -593,7 +587,7 @@ impl Line { /// this means that any trailing newline will be stripped unless it is /// directly referenced by the `span` offset /// (starts _at_ the newline). - fn format_for(&mut self, span: Span) -> (Vec, Span, Option) { + fn format_for(&mut self, span: Span) -> (String, Span, Option) { // Trim any newline unless the span starts at the last byte, // which would reference the newline character itself. if span.offset() as usize != self.total_offset_read() { @@ -611,7 +605,15 @@ impl Line { let buf = self.take_buf().unwrap_or(vec![]); let line_span = span.context().span_or_zz(offset_start, buf.len()); - (buf, line_span, Some(column)) + // SAFETY: We have already verified above, + // by `line`, + // that this represents a valid UTF-8 string, + // otherwise we would not have reached this line. + // Refactoring could likely avoid the need to convert to a string + // separately from the original check. + let strbuf = unsafe { String::from_utf8_unchecked(buf) }; + + (strbuf, line_span, Some(column)) } /// Determine the [`Span`] endpoint offsets relative to the line start. diff --git a/tamer/src/diagnose/resolver/test.rs b/tamer/src/diagnose/resolver/test.rs index 9b950bea..91d317bc 100644 --- a/tamer/src/diagnose/resolver/test.rs +++ b/tamer/src/diagnose/resolver/test.rs @@ -610,7 +610,7 @@ fn invalid_unicode_no_column() { // Make sure we're still trimming despite the // error. buf.pop(); - buf.into() + String::from_utf8_lossy(&buf).into() }, }]) .unwrap(),