From 5b1f0ab6c613dae01c96f6e6e53540346617c042 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Thu, 21 Apr 2022 14:16:21 -0400 Subject: [PATCH] tamer: diagnostic: Column resolution Determining the column number is not as simple as performing byte arithmetic, because certain characters have different widths. Even if we only accepted ASCII, control characters aren't visible to the user. This uses the unicode-width crate as an alternative to POSIX wcwidth, to determine (hopefully) the number of fixed-width cells that a unicode character will take up on a terminal. For example, control characters are zero-width, while an emoji is likely double-width. See test cases for more information on that. There is also the unicode-segmentation crate, which can handle extended grapheme clusters and such, but (a) we'll be outputting the line to the terminal and (b) there's no guarantee that the user's editor displays grapheme clusters as a single column. LSP measures in UTF-16, apparently. I use both Emacs and Vim from a terminal, so unicode-width applies to me. There's too much variation to try to solve that right now. The columns can be considered a visual span---this gives us enough information to draw line annotations, which will happen soon. Here are some useful links: - https://hsivonen.fi/string-length/ - https://unicode.org/reports/tr29/ - https://github.com/rust-analyzer/rowan/issues/17 - https://www.reddit.com/r/rust/comments/gpw2ra/how_is_the_rust_compiler_able_to_tell_the_visible/ DEV-10935 --- tamer/Cargo.lock | 1 + tamer/Cargo.toml | 1 + tamer/src/diagnose/report.rs | 133 +++++++- tamer/src/diagnose/resolver.rs | 548 ++++++++++++++++++++++++++++++--- 4 files changed, 618 insertions(+), 65 deletions(-) diff --git a/tamer/Cargo.lock b/tamer/Cargo.lock index e90d426a..7c313bbb 100644 --- a/tamer/Cargo.lock +++ b/tamer/Cargo.lock @@ -141,6 +141,7 @@ dependencies = [ "petgraph-graphml", "quick-xml", "static_assertions", + "unicode-width", ] [[package]] diff --git a/tamer/Cargo.toml b/tamer/Cargo.toml index bfdeeb6a..5ae729a0 100644 --- a/tamer/Cargo.toml +++ b/tamer/Cargo.toml @@ -35,6 +35,7 @@ petgraph-graphml = "3.0.0" static_assertions = ">= 1.1.0" memchr = ">= 2.3.4" # quick-xml expects =2.3.4 at the time paste = ">= 1.0.5" +unicode-width = "0.1.5" # Feature flags can be specified using `./configure FEATURES=foo,bar,baz`. # diff --git a/tamer/src/diagnose/report.rs b/tamer/src/diagnose/report.rs index d8bbd111..ccb97e6b 100644 --- a/tamer/src/diagnose/report.rs +++ b/tamer/src/diagnose/report.rs @@ -19,7 +19,9 @@ //! Rendering of diagnostic information. -use super::{AnnotatedSpan, Diagnostic, Label, Level, SpanResolver}; +use super::{ + AnnotatedSpan, Diagnostic, Label, Level, ResolvedSpan, SpanResolver, +}; use crate::span::{Span, SpanOffsetSize, UNKNOWN_SPAN}; use std::fmt::{self, Write}; @@ -97,6 +99,61 @@ impl VisualReporter { ) -> fmt::Result { writeln!(to, " {level}: {label}") } + + /// Attempt to render column offset. + /// + /// The happy path simply outputs `":N\n"`, + /// where `N` is the column number. + /// + /// If the column is not available, + /// then the line did not contain valid UTF-8. + /// In this case, + /// raw relative byte offsets are output along with help information + /// notifying the user of the issue; + /// this is hopefully enough information to quickly diagnose the + /// problem. + fn render_col(to: &mut impl Write, rspan: ResolvedSpan) -> fmt::Result { + let span = rspan.span; + + match rspan.col() { + Some(col) => writeln!(to, ":{}", col)?, + + // The column is unavailable, + // which means that the line must have contained invalid UTF-8. + // Output what we can in an attempt to help the user debug. + None => { + let rel = rspan + .first_line_span() + .and_then(|lspan| span.relative_to(lspan)) + .unwrap_or(UNKNOWN_SPAN); + + writeln!( + to, + " bytes {}--{}", + rel.offset(), + rel.endpoints_saturated().1.offset() + )?; + + Self::render_label( + to, + Level::Help, + "unable to calculate columns because the line is \ + not a valid UTF-8 string" + .into(), + )?; + + Self::render_label( + to, + Level::Help, + "you have been provided with 0-indexed \ + line-relative inclusive byte offsets" + .into(), + )?; + } + } + + Ok(()) + } } impl Reporter for VisualReporter { @@ -137,7 +194,8 @@ impl Reporter for VisualReporter { } Ok(rspan) => match rspan.line() { Some(line) => { - writeln!(to, ":{}", line)?; + write!(to, ":{}", line)?; + Self::render_col(to, rspan)?; } None => Self::render_fallback_span_offset(to, span)?, }, @@ -223,15 +281,20 @@ mod test { // len: 14 const FILE_BAR_BAZ: &[u8] = - b"bar/baz line 1\nbar/baz line2\nbar/baz line3\nbar/baz line4"; + b"bar/baz line 1\nbar/baz line 2\nbar/baz line 3\nbar/baz line 4"; // Offsets for this are the same as `FILE_FOO_BAR`. + const FILE_INVALID_UTF8: &[u8] = b"bad \xC0!"; + // |---- | + // 0 5 + macro_rules! assert_report { ($msg:expr, $aspans:expr, $expected:expr) => { let mut resolver = HashMap::>::new(); let ctx_foo_bar = Context::from("foo/bar"); let ctx_bar_baz = Context::from("bar/baz"); + let ctx_inv_utf = Context::from("invalid/utf8"); resolver.insert( ctx_foo_bar, @@ -241,6 +304,13 @@ mod test { ctx_bar_baz, BufSpanResolver::new(Cursor::new(FILE_BAR_BAZ), ctx_bar_baz), ); + resolver.insert( + ctx_inv_utf, + BufSpanResolver::new( + Cursor::new(FILE_INVALID_UTF8), + ctx_inv_utf, + ), + ); let mut sut = VisualReporter::new(resolver); @@ -271,7 +341,7 @@ mod test { // Context and span are rendered without a label. "\ error: single span no label - --> foo/bar:4 + --> foo/bar:4:6 " ); } @@ -286,7 +356,7 @@ error: single span no label // Context and span are rendered without a label. "\ error: single span with label - --> bar/baz:3 + --> bar/baz:3:1 error: span label here " ); @@ -306,7 +376,7 @@ error: single span with label // duplicate spans without some additional context. "\ error: multiple adjacent same span no label - --> foo/bar:4 + --> foo/bar:4:6 " ); } @@ -327,7 +397,7 @@ error: multiple adjacent same span no label // spans are the same. "\ error: multiple adjacent same span with labels - --> bar/baz:1 + --> bar/baz:1:11 error: A label error: C label " @@ -356,14 +426,14 @@ error: multiple adjacent same span with labels ], "\ error: eq context neq offset/len - --> bar/baz:1 + --> bar/baz:1:11 error: A, first label - --> bar/baz:1 + --> bar/baz:1:11 error: B, different length error: B, collapse - --> bar/baz:2 + --> bar/baz:2:1 error: C, different offset - --> bar/baz:1 + --> bar/baz:1:11 error: B', not adjacent " ); @@ -401,16 +471,16 @@ error: eq context neq offset/len ], "\ error: multiple adjacent different context - --> foo/bar:1 + --> foo/bar:1:11 error: A, first error: A, collapsed - --> bar/baz:1 + --> bar/baz:1:11 error: B, first error: B, collapsed - --> foo/bar:1 + --> foo/bar:1:11 error: A, not collapsed - --> bar/baz:1 - --> foo/bar:1 + --> bar/baz:1:11 + --> foo/bar:1:11 " ); } @@ -430,7 +500,7 @@ error: multiple adjacent different context ], "\ error: multiple spans with labels of different severity level - --> foo/bar:4 + --> foo/bar:4:6 internal error: an internal error error: an error note: a note @@ -475,4 +545,33 @@ error: unresolvable context fallback ") ); } + + /// If the span columns cannot be determined, + /// we can still display everything else. + /// Such a thing should only happen if the line contains invalid UTF-8, + /// so we want to be able to help the user track down the invalid byte. + #[test] + fn fallback_when_column_fails_to_resolve() { + let ctx = Context::from("invalid/utf8"); + + let span = ctx.span(4, 2); + + // It's not ideal that the help appears first, + // but this should only happen under very exceptional + // circumstances so it's not worth trying to resolve. + // If you're reading this and it's trivial to swap these with the + // current state of the system, + // go for it. + assert_report!( + "column resolution failure", + vec![span.error("an error we do not want to suppress"),], + "\ +error: column resolution failure + --> invalid/utf8:1 bytes 4--6 + help: unable to calculate columns because the line is not a valid UTF-8 string + help: you have been provided with 0-indexed line-relative inclusive byte offsets + error: an error we do not want to suppress +" + ); + } } diff --git a/tamer/src/diagnose/resolver.rs b/tamer/src/diagnose/resolver.rs index 3947e74c..9db1844f 100644 --- a/tamer/src/diagnose/resolver.rs +++ b/tamer/src/diagnose/resolver.rs @@ -29,7 +29,9 @@ use std::{ io::{self, BufRead, BufReader, Seek}, mem::take, num::NonZeroU32, + str::Utf8Error, }; +use unicode_width::UnicodeWidthChar; /// Resolves [`Span`]s into line:column source locations. /// @@ -97,25 +99,57 @@ pub struct ResolvedSpan { /// It should be the case that the [`Context`] of each [`SourceLine`] of /// this field is equal to the [`Context`] of the `span` field. pub lines: Vec, - - /// Column offset pair within the first and last [`SourceLine`]s. - /// - /// Column begins atย `1`, - /// so if the [`Span`] begins at the first byte within - /// `lines.first()`, - /// the first column will have a value ofย `1`. - /// The ending column represens the 1-indexed offset relative to - /// `lines.last()`. - /// - /// If there are no `lines` available, - /// then the columns are not known and will beย [`None`]. - pub columns: Option<(NonZeroU32, NonZeroU32)>, } impl ResolvedSpan { pub fn line(&self) -> Option { self.lines.get(0).map(|line| line.line) } + + pub fn col(&self) -> Option { + self.lines.get(0).and_then(|line| line.column) + } + + pub fn first_line_span(&self) -> Option { + self.lines.get(0).map(|line| line.span) + } +} + +/// Source column offsets. +/// +/// A "column" is somewhat loosely defined as a terminal cell. +/// Certain unicode characters occupy more than one cell, +/// while others occupy none. +/// Consequently, +/// a column can be thought of a "visual [`Span`]", +/// representing what the user would perceive as a column in a fixed +/// with font rather than a byte offset. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Column { + /// A 1-indexed column number. + At(NonZeroU32), + + /// A range of 1-indexed columns, inclusive. + Endpoints(NonZeroU32, NonZeroU32), + + /// Immediately before a column. + /// + /// This is conceptually like a bar cursor + /// (non-block) + /// that places itself between two columns. + /// It is caused by a zero-length [`Span`]. + Before(NonZeroU32), +} + +impl Display for Column { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + // Coerces to a single column number. + Self::At(at) | Self::Endpoints(at, _) | Self::Before(at) => { + Display::fmt(at, f) + } + } + } } #[derive(Debug, PartialEq, Eq)] @@ -123,18 +157,16 @@ pub struct SourceLine { /// 1-indexed line number relative to the entire source [`Context`]. line: NonZeroU32, + /// 1-indexed column number(s) relative to the beginning of the line. + /// + /// If the line contains invalid UTF-8, + /// this may be [`None`]. + column: Option, + /// The [`Span`] representing the entire source line. span: Span, /// Source code text of the line _excluding_ the newline. - /// - /// This is stored as a byte vector, - /// rather than a string, - /// so that we can still output source code verbatim even if it is - /// invalid UTF-8. - /// This could also allow for potential future enhancements, - /// like outputting binary data as stylized hexadecimal - /// (e.g. a future `xmlo` replacement). text: Vec, } @@ -233,10 +265,11 @@ impl SpanResolver for BufSpanResolver { while let Some(mut line) = self.read_next_line(buf, span)? { if line.at_or_beyond(span) { - let (text, line_span) = line.format_for(span); + let (text, line_span, column) = line.format_for(span); lines.push(SourceLine { line: self.line_num, + column, text, span: line_span, }); @@ -252,11 +285,7 @@ impl SpanResolver for BufSpanResolver { self.line_num = self.line_num.saturating_add(1); } - Ok(ResolvedSpan { - span, - lines, - columns: None, - }) + Ok(ResolvedSpan { span, lines }) } } @@ -318,6 +347,15 @@ impl LineBytes { | Self::WithEof(buf) => Some(buf), } } + + fn as_str(&self) -> Result<&str, Utf8Error> { + match self { + Self::Eof => Ok(&""), + Self::WithNewline(buf) + | Self::WithoutNewline(buf) + | Self::WithEof(buf) => std::str::from_utf8(buf), + } + } } impl From> for LineBytes { @@ -384,6 +422,31 @@ impl Line { self.total_offset_read() >= span.offset() as usize } + /// Line buffer as a UTF-8 slice. + fn line_as_str(&self) -> Result<&str, Utf8Error> { + self.bytes + .as_ref() + .map(LineBytes::as_str) + .unwrap_or(Ok(&"")) + } + + /// Produce formatted output for a line containing invalid UTF-8 data. + /// + /// This still produces the actual line so that we can help the user + /// track down the invalid byte sequences, + /// but it is unable to resolve columns. + /// + /// This is delegated to by [`Line::format_for`]. + fn format_invalid_utf8_for( + &mut self, + span: Span, + ) -> (Vec, Span, Option) { + let bytes = self.take_buf().unwrap_or(vec![]); + let span = span.ctx().span_or_zz(0, bytes.len()); + + (bytes, span, None) + } + /// Format the line buffer and provide an associated line [`Span`] /// that provides a source context suitable for the provided span. /// @@ -391,19 +454,126 @@ impl Line { /// this means that any trailing newline will be stripped unless it is /// directly referenced by the `span` offset /// (starts _at_ the newline). - fn format_for(&mut self, span: Span) -> (Vec, Span) { - // Trim the newline (if any) unless the span starts at the last - // byte, - // which would reference the newline character itself. + fn format_for(&mut self, span: Span) -> (Vec, Span, Option) { + // Trim any newline unless the span starts at the last byte, + // which would reference the newline character itself. if span.offset() as usize != self.total_offset_read() { self.bytes = take(&mut self.bytes).map(LineBytes::trim_nl); } + let line = match self.line_as_str() { + Ok(s) => s, + Err(_) => return self.format_invalid_utf8_for(span), + }; + + let column = self.resolve_columns(line, span); + let offset_start = self.offset_start; let buf = self.take_buf().unwrap_or(vec![]); - let span = span.ctx().span_or_zz(offset_start, buf.len()); + let line_span = span.ctx().span_or_zz(offset_start, buf.len()); - (buf, span) + (buf, line_span, Some(column)) + } + + /// Determine the [`Span`] endpoint offsets relative to the line start. + fn relative_byte_offsets(&self, span: Span) -> (usize, usize) { + let span_offset_end = + span.offset() as usize + span.len().max(1) as usize - 1; + + ( + (span.offset() as usize).saturating_sub(self.offset_start), + span_offset_end.saturating_sub(self.offset_start), + ) + } + + /// Determine the 1-indexed column number for each [`Span`] endpoint, + /// relative to the start of the line. + /// + /// For multi-line spans, + /// the column endpoints for the first line will continue to the end + /// of the line, + /// columns for the middle lines will encompass the entire line, + /// and the last line will begin at column 1. + /// That is: + /// + /// ```text + /// span start + /// v + /// line 1 + /// line 2 + /// line 4 + /// ^ span end + /// + /// # Will have its columns reported as: + /// line 1 + /// |-| [4,6] + /// line 2 + /// |----| [1,6] + /// line 4 + /// |^^| [1,4] + /// ``` + fn resolve_columns(&self, line: &str, span: Span) -> Column { + // The max(1) here is intended to accommodate zero-length spans. + let span_offset_end = + span.offset() as usize + span.len().max(1) as usize - 1; + + // We should stop calculating widths after this offset, + // which is EOL or the span ending offset, + // whichever comes first. + let max_offset = self.total_offset_read().min(span_offset_end); + + // This will produce `(index, width)` pairs for the line until we + // reach `max_offset` above. + let widths = line.char_indices().map_while(|(i, c)| { + (i <= max_offset).then(|| (i, c.width().unwrap_or(0))) + }); + + let (rel_start, rel_end) = self.relative_byte_offsets(span); + + // Count columns according to character widths in a single pass over + // the line. + // + // Note that this is summing the two column values _independently_; + // this is not the most efficient way to proceed, + // but it is good enough for our uses without starting to get + // creative, + // for which there are a number of possible approaches. + // Once we start processing spans in bulk + // (e.g. using the diagnostic system to produce information for + // every identifier in a file), + // additional optimizations will be needed anyway, + // so there's no use in doing something more complicated until + // we know specifically what use cases we'll be optimizing for. + let (start, end) = widths.fold((1, 0), |(start, end), (i, width)| { + ( + (i < rel_start).then(|| start + width).unwrap_or(start), + (i <= rel_end).then(|| end + width).unwrap_or(end), + ) + }); + + // If the system is operating correctly, + // both column endpoints should be non-zero. + // With that said, + // we never want the diagnostic system to panic, + // so play it safe anyway just in case. + // + // When we start processing spans in bulk we may wish to tighten our + // guarantees to eliminate these checks. + let (col_start, col_end) = ( + NonZeroU32::new(start.try_into().unwrap_or(0)) + .unwrap_or(NonZeroU32::MIN), + NonZeroU32::new(end.try_into().unwrap_or(0)) + .unwrap_or(NonZeroU32::MIN), + ); + + // Start will only be > end (by 1) if the span begins on a newline. + if span.len() == 0 { + Column::Before(col_start) + } else if col_start >= col_end { + Column::At(col_start) + } else { + Column::Endpoints(col_start, col_end) + } } /// Take ownership of the line buffer. @@ -543,10 +713,13 @@ mod test { span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 4.unwrap_into() + )), span: ctx.span(7, 6), text: "line 2".into(), }], - columns: None, }), sut.resolve(span), ); @@ -570,10 +743,10 @@ mod test { span, lines: vec![SourceLine { line: 3.unwrap_into(), + column: Some(Column::At(6.unwrap_into(),)), span: ctx.span(14, 6), text: "line 3".into(), }], - columns: None, }), sut.resolve(span), ); @@ -600,17 +773,21 @@ mod test { span, lines: vec![SourceLine { line: 3.unwrap_into(), + column: Some(Column::Endpoints( + 3.unwrap_into(), + 6.unwrap_into() + )), span: ctx.span(14, 6), text: "line 3".into(), }], - columns: None, }), sut.resolve(span), ); } + // A first and last line. #[test] - fn multiple_lines() { + fn multiple_lines_first_last() { let ctx = Context::from("foobar"); let buf = "line 1\nline start 2\nend line 3"; // | |-----+- +-| | @@ -628,16 +805,80 @@ mod test { lines: vec![ SourceLine { line: 2.unwrap_into(), + // From the point, to the end of the line. + column: Some(Column::Endpoints( + 6.unwrap_into(), + 12.unwrap_into() + )), span: ctx.span(7, 12), text: "line start 2".into(), }, SourceLine { line: 3.unwrap_into(), + // From the beginning of the line, to the point. + column: Some(Column::Endpoints( + 1.unwrap_into(), + 3.unwrap_into() + )), + span: ctx.span(20, 10), + text: "end line 3".into(), + }, + ], + }), + sut.resolve(span), + ); + } + + // If there are more than two lines, + // middle lines' column ranges span the entire line. + #[test] + fn multiple_lines_middle_line_endpoints() { + let ctx = Context::from("foobar"); + let buf = "line start 1\nline 2\nend line 3"; + // | |-----+- +----+- +-| | + // | 5 | | | |22 | + // |----------| |----| |--------| + // 0 11 13 18 20 29 + + let span = ctx.span(5, 18); + + let mut sut = BufSpanResolver::new(Cursor::new(buf), ctx); + + assert_eq!( + Ok(ResolvedSpan { + span, + lines: vec![ + SourceLine { + line: 1.unwrap_into(), + // From the point, to the end of the line. + column: Some(Column::Endpoints( + 6.unwrap_into(), + 12.unwrap_into() + )), + span: ctx.span(0, 12), + text: "line start 1".into(), + }, + SourceLine { + line: 2.unwrap_into(), + // Entire line. + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), + span: ctx.span(13, 6), + text: "line 2".into(), + }, + SourceLine { + line: 3.unwrap_into(), + // From the beginning of the line, to the point. + column: Some(Column::Endpoints( + 1.unwrap_into(), + 3.unwrap_into() + )), span: ctx.span(20, 10), text: "end line 3".into(), }, ], - columns: None, }), sut.resolve(span), ); @@ -664,10 +905,13 @@ mod test { span, lines: vec![SourceLine { line: 1.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: ctx.span(0, 6), text: "line 1".into(), },], - columns: None, }), sut.resolve(span), ); @@ -697,6 +941,7 @@ mod test { span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::At(7.unwrap_into())), // Trailing newline _is not_ stripped since it was // explicitly referenced; // we don't want our line span to not contain the @@ -704,7 +949,6 @@ mod test { span: ctx.span(7, 7), text: "line 2\n".into(), }], - columns: None, }), sut.resolve(span), ); @@ -731,10 +975,10 @@ mod test { span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Before(4.unwrap_into())), span: ctx.span(7, 6), text: "line 2".into(), }], - columns: None, }), sut.resolve(span), ); @@ -763,6 +1007,7 @@ mod test { span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Before(7.unwrap_into())), // Trailing newline _is not_ stripped since it was // explicitly referenced; // we don't want our line span to not contain the @@ -770,7 +1015,6 @@ mod test { span: ctx.span(7, 7), text: "line 2\n".into(), }], - columns: None, }), sut.resolve(span), ); @@ -799,10 +1043,10 @@ mod test { span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Before(1.unwrap_into())), span: ctx.span(7, 6), text: "line 2".into(), }], - columns: None, }), sut.resolve(span), ); @@ -828,10 +1072,13 @@ mod test { span: span_a, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: span_a, text: "line 2".into(), }], - columns: None, }), sut.resolve(span_a), ); @@ -841,10 +1088,13 @@ mod test { span: span_b, lines: vec![SourceLine { line: 3.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: span_b, text: "line 3".into(), }], - columns: None, }), sut.resolve(span_b), ); @@ -868,10 +1118,13 @@ mod test { span: span, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: span, text: "line 2".into(), }], - columns: None, }), sut.resolve(span), ); @@ -897,10 +1150,13 @@ mod test { span: span_later, lines: vec![SourceLine { line: 2.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: span_later, text: "line 2".into(), }], - columns: None, }), sut.resolve(span_later), ); @@ -912,12 +1168,208 @@ mod test { span: span_earlier, lines: vec![SourceLine { line: 1.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 6.unwrap_into() + )), span: span_earlier, text: "line 1".into(), }], - columns: None, }), sut.resolve(span_earlier), ); } + + // We cannot properly determine the column if a line contains invalid + // unicode, + // because we cannot confidently determine how the line ought to be + // displayed to the user + // (that's up to their terminal). + // + // But we should display what we can, + // which means still producing the line itself, + // so that we can help the user track down the bad byte sequence that + // was almost certainly unintentional and may have even come from + // pasting text from another document. + #[test] + fn invalid_unicode_no_column() { + let ctx = Context::from("invalid-unicode"); + + let mut buf = b"bad \xC0!\n".to_vec(); + // |---- | + // 0 5 + + let span = ctx.span(0, 4); + + let mut sut = BufSpanResolver::new(Cursor::new(buf.clone()), ctx); + + assert_eq!( + Ok(ResolvedSpan { + span, + lines: vec![SourceLine { + line: 1.unwrap_into(), + column: None, + span: ctx.span(0, 6), + text: { + // Make sure we're still trimming despite the + // error. + buf.pop(); + buf.into() + }, + }], + }), + sut.resolve(span), + ); + } + + // Account for the width of unicode characters with a fixed-width font, + // in a manner similar to POSIX `wcwidth(3)`. + // TAMER uses the `unicode-width` crate, + // which is the same crate used by Rustc. + #[test] + fn unicode_width() { + let ctx = Context::from("unicode-width"); + + let buf = "0:\0\n1:โ€œ\n2:๐Ÿ˜Š"; + // |-| |-| |--| + // bytes: 0 2 4 8 10 15 + // col: 1 2 1 3 1 4 + + // Remember: spans are _byte_-oriented. + let span_0 = ctx.span(0, 3); + let span_1 = ctx.span(4, 5); + let span_2 = ctx.span(10, 6); + + let mut sut = BufSpanResolver::new(Cursor::new(buf), ctx); + + assert_eq!( + Ok(ResolvedSpan { + span: span_0, + lines: vec![SourceLine { + line: 1.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 2.unwrap_into() + )), + span: span_0, + text: "0:\0".into(), + }], + }), + sut.resolve(span_0), + ); + + assert_eq!( + Ok(ResolvedSpan { + span: span_1, + lines: vec![SourceLine { + line: 2.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 3.unwrap_into() + )), + span: span_1, + text: "1:โ€œ".into(), + }], + }), + sut.resolve(span_1), + ); + + assert_eq!( + Ok(ResolvedSpan { + span: span_2, + lines: vec![SourceLine { + line: 3.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 4.unwrap_into() + )), + span: span_2, + text: "2:๐Ÿ˜Š".into(), + }], + }), + sut.resolve(span_2), + ); + } + + // If a span somehow points to a byte that does not represent a valid + // UTF-8 character boundary, + // then we still want to produce sensible output. + // + // The behavior here is a consequence of implementation details. + // This test merely acknowledges the behavior to show that it has been + // considered, + // and to bring attention to the issue if the implementation details + // cause a change in behavior. + // At this time, + // there's no compelling reason to complicate the implementation to + // add additional checks that would produce more intuitive column + // values for these cases that are very unlikely to occur. + #[test] + fn at_invalid_char_boundary() { + let ctx = Context::from("unicode-width"); + + // Charcater is 4 bytes. + let buf = "(๐Ÿ˜Š)"; + // |--| + // bytes: 0 5 + // col: 1 4 + + // Ends at the first byte of the multibyte char. + let span_end_bad = ctx.span(0, 2); + // Starts at byte 2 of 4 for the multibyte char. + let span_start_bad = ctx.span(3, 2); + // _Both_ starts _and_ ends in the middle of the char. + let span_all_bad = ctx.span(2, 1); + + let line_span = ctx.span(0, 6); + + let mut sut = BufSpanResolver::new(Cursor::new(buf.clone()), ctx); + + assert_eq!( + Ok(ResolvedSpan { + span: span_end_bad, + lines: vec![SourceLine { + line: 1.unwrap_into(), + column: Some(Column::Endpoints( + 1.unwrap_into(), + 3.unwrap_into() + )), + span: line_span, + text: buf.clone().into(), + }], + }), + sut.resolve(span_end_bad), + ); + + assert_eq!( + Ok(ResolvedSpan { + span: span_start_bad, + lines: vec![SourceLine { + line: 1.unwrap_into(), + // Intuitively this really should be [2,4], + // but the implementation shouldn't change to + // accommodate this very unlikely case. + column: Some(Column::At(4.unwrap_into(),)), + span: line_span, + text: buf.clone().into(), + }], + }), + sut.resolve(span_start_bad), + ); + + assert_eq!( + Ok(ResolvedSpan { + span: span_all_bad, + lines: vec![SourceLine { + line: 1.unwrap_into(), + // Also unideal, + // but see comment for previous assertion. + column: Some(Column::At(4.unwrap_into(),)), + span: line_span, + text: buf.clone().into(), + }], + }), + sut.resolve(span_all_bad), + ); + } }