core/str/lossy.rs
1use super::from_utf8_unchecked;
2use super::validations::utf8_char_width;
3use crate::fmt;
4use crate::fmt::{Formatter, Write};
5use crate::iter::FusedIterator;
6
7impl [u8] {
8    /// Creates an iterator over the contiguous valid UTF-8 ranges of this
9    /// slice, and the non-UTF-8 fragments in between.
10    ///
11    /// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
12    ///
13    /// # Examples
14    ///
15    /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
16    /// code in the form of a C-string literal (`c"..."`).
17    ///
18    /// ```
19    /// use std::fmt::Write as _;
20    ///
21    /// pub fn cstr_literal(bytes: &[u8]) -> String {
22    ///     let mut repr = String::new();
23    ///     repr.push_str("c\"");
24    ///     for chunk in bytes.utf8_chunks() {
25    ///         for ch in chunk.valid().chars() {
26    ///             // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
27    ///             write!(repr, "{}", ch.escape_debug()).unwrap();
28    ///         }
29    ///         for byte in chunk.invalid() {
30    ///             write!(repr, "\\x{:02X}", byte).unwrap();
31    ///         }
32    ///     }
33    ///     repr.push('"');
34    ///     repr
35    /// }
36    ///
37    /// fn main() {
38    ///     let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
39    ///     let expected = stringify!(c"\xFErris the 🦀\u{7}");
40    ///     assert_eq!(lit, expected);
41    /// }
42    /// ```
43    #[stable(feature = "utf8_chunks", since = "1.79.0")]
44    pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
45        Utf8Chunks { source: self }
46    }
47}
48
49/// An item returned by the [`Utf8Chunks`] iterator.
50///
51/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
52/// when decoding a UTF-8 string.
53///
54/// # Examples
55///
56/// ```
57/// // An invalid UTF-8 string
58/// let bytes = b"foo\xF1\x80bar";
59///
60/// // Decode the first `Utf8Chunk`
61/// let chunk = bytes.utf8_chunks().next().unwrap();
62///
63/// // The first three characters are valid UTF-8
64/// assert_eq!("foo", chunk.valid());
65///
66/// // The fourth character is broken
67/// assert_eq!(b"\xF1\x80", chunk.invalid());
68/// ```
69#[stable(feature = "utf8_chunks", since = "1.79.0")]
70#[derive(Clone, Debug, PartialEq, Eq)]
71pub struct Utf8Chunk<'a> {
72    valid: &'a str,
73    invalid: &'a [u8],
74}
75
76impl<'a> Utf8Chunk<'a> {
77    /// Returns the next validated UTF-8 substring.
78    ///
79    /// This substring can be empty at the start of the string or between
80    /// broken UTF-8 characters.
81    #[must_use]
82    #[stable(feature = "utf8_chunks", since = "1.79.0")]
83    pub fn valid(&self) -> &'a str {
84        self.valid
85    }
86
87    /// Returns the invalid sequence that caused a failure.
88    ///
89    /// The returned slice will have a maximum length of 3 and starts after the
90    /// substring given by [`valid`]. Decoding will resume after this sequence.
91    ///
92    /// If empty, this is the last chunk in the string. If non-empty, an
93    /// unexpected byte was encountered or the end of the input was reached
94    /// unexpectedly.
95    ///
96    /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
97    /// CHARACTER`].
98    ///
99    /// [`valid`]: Self::valid
100    /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
101    #[must_use]
102    #[stable(feature = "utf8_chunks", since = "1.79.0")]
103    pub fn invalid(&self) -> &'a [u8] {
104        self.invalid
105    }
106}
107
108#[must_use]
109#[unstable(feature = "str_internals", issue = "none")]
110pub struct Debug<'a>(&'a [u8]);
111
112#[unstable(feature = "str_internals", issue = "none")]
113impl fmt::Debug for Debug<'_> {
114    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
115        f.write_char('"')?;
116
117        for chunk in self.0.utf8_chunks() {
118            // Valid part.
119            // Here we partially parse UTF-8 again which is suboptimal.
120            {
121                let valid = chunk.valid();
122                let mut from = 0;
123                for (i, c) in valid.char_indices() {
124                    let esc = c.escape_debug();
125                    // If char needs escaping, flush backlog so far and write, else skip
126                    if esc.len() != 1 {
127                        f.write_str(&valid[from..i])?;
128                        for c in esc {
129                            f.write_char(c)?;
130                        }
131                        from = i + c.len_utf8();
132                    }
133                }
134                f.write_str(&valid[from..])?;
135            }
136
137            // Broken parts of string as hex escape.
138            for &b in chunk.invalid() {
139                write!(f, "\\x{:02X}", b)?;
140            }
141        }
142
143        f.write_char('"')
144    }
145}
146
147/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
148/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
149///
150/// If you want a simple conversion from UTF-8 byte slices to string slices,
151/// [`from_utf8`] is easier to use.
152///
153/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154///
155/// [byteslice]: slice
156/// [`from_utf8`]: super::from_utf8
157///
158/// # Examples
159///
160/// This can be used to create functionality similar to
161/// [`String::from_utf8_lossy`] without allocating heap memory:
162///
163/// ```
164/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
165///     for chunk in input.utf8_chunks() {
166///         push(chunk.valid());
167///
168///         if !chunk.invalid().is_empty() {
169///             push("\u{FFFD}");
170///         }
171///     }
172/// }
173/// ```
174///
175/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
176#[must_use = "iterators are lazy and do nothing unless consumed"]
177#[stable(feature = "utf8_chunks", since = "1.79.0")]
178#[derive(Clone)]
179pub struct Utf8Chunks<'a> {
180    source: &'a [u8],
181}
182
183impl<'a> Utf8Chunks<'a> {
184    #[doc(hidden)]
185    #[unstable(feature = "str_internals", issue = "none")]
186    pub fn debug(&self) -> Debug<'_> {
187        Debug(self.source)
188    }
189}
190
191#[stable(feature = "utf8_chunks", since = "1.79.0")]
192impl<'a> Iterator for Utf8Chunks<'a> {
193    type Item = Utf8Chunk<'a>;
194
195    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
196        if self.source.is_empty() {
197            return None;
198        }
199
200        const TAG_CONT_U8: u8 = 128;
201        fn safe_get(xs: &[u8], i: usize) -> u8 {
202            *xs.get(i).unwrap_or(&0)
203        }
204
205        let mut i = 0;
206        let mut valid_up_to = 0;
207        while i < self.source.len() {
208            // SAFETY: `i < self.source.len()` per previous line.
209            // For some reason the following are both significantly slower:
210            // while let Some(&byte) = self.source.get(i) {
211            // while let Some(byte) = self.source.get(i).copied() {
212            let byte = unsafe { *self.source.get_unchecked(i) };
213            i += 1;
214
215            if byte < 128 {
216                // This could be a `1 => ...` case in the match below, but for
217                // the common case of all-ASCII inputs, we bypass loading the
218                // sizeable UTF8_CHAR_WIDTH table into cache.
219            } else {
220                let w = utf8_char_width(byte);
221
222                match w {
223                    2 => {
224                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
225                            break;
226                        }
227                        i += 1;
228                    }
229                    3 => {
230                        match (byte, safe_get(self.source, i)) {
231                            (0xE0, 0xA0..=0xBF) => (),
232                            (0xE1..=0xEC, 0x80..=0xBF) => (),
233                            (0xED, 0x80..=0x9F) => (),
234                            (0xEE..=0xEF, 0x80..=0xBF) => (),
235                            _ => break,
236                        }
237                        i += 1;
238                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
239                            break;
240                        }
241                        i += 1;
242                    }
243                    4 => {
244                        match (byte, safe_get(self.source, i)) {
245                            (0xF0, 0x90..=0xBF) => (),
246                            (0xF1..=0xF3, 0x80..=0xBF) => (),
247                            (0xF4, 0x80..=0x8F) => (),
248                            _ => break,
249                        }
250                        i += 1;
251                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
252                            break;
253                        }
254                        i += 1;
255                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
256                            break;
257                        }
258                        i += 1;
259                    }
260                    _ => break,
261                }
262            }
263
264            valid_up_to = i;
265        }
266
267        // SAFETY: `i <= self.source.len()` because it is only ever incremented
268        // via `i += 1` and in between every single one of those increments, `i`
269        // is compared against `self.source.len()`. That happens either
270        // literally by `i < self.source.len()` in the while-loop's condition,
271        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272        // loop is terminated as soon as the latest `i += 1` has made `i` no
273        // longer less than `self.source.len()`, which means it'll be at most
274        // equal to `self.source.len()`.
275        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
276        self.source = remaining;
277
278        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
279        // `valid_up_to = i` and `i` only increases.
280        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
281
282        Some(Utf8Chunk {
283            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
284            valid: unsafe { from_utf8_unchecked(valid) },
285            invalid,
286        })
287    }
288}
289
// `next` returns `None` exactly when the remaining source is empty, and it
// only ever shrinks the source, so after the first `None` every later call
// returns `None` as well.
#[stable(feature = "utf8_chunks", since = "1.79.0")]
impl FusedIterator for Utf8Chunks<'_> {}
292
293#[stable(feature = "utf8_chunks", since = "1.79.0")]
294impl fmt::Debug for Utf8Chunks<'_> {
295    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
296        f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
297    }
298}