From df29f9b0c3a2277c2eb2de2e0bf4f4b3329dbd00 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 19 Nov 2024 14:48:48 +1100 Subject: [PATCH 1/3] Improve `fake_ident_or_unknown_prefix`. - Rename it as `invalid_ident_or_prefix`, which matches the possible outputs (`InvalidIdent` or `InvalidPrefix`). - Use the local wrapper for `is_xid_continue`, for consistency. - Make it clear what `\u{200d}` means. --- compiler/rustc_lexer/src/lib.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index f9f2a14dbd2..7f221098364 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -468,7 +468,7 @@ impl Cursor<'_> { Literal { kind, suffix_start } } // Identifier starting with an emoji. Only lexed for graceful error recovery. - c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), _ => Unknown, }; let res = Token::new(token_kind, self.pos_within_token()); @@ -552,17 +552,16 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, - c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), _ => Ident, } } - fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { + fn invalid_ident_or_prefix(&mut self) -> TokenKind { // Start is already eaten, eat the rest of identifier. self.eat_while(|c| { - unicode_xid::UnicodeXID::is_xid_continue(c) - || (!c.is_ascii() && c.is_emoji_char()) - || c == '\u{200d}' + const ZERO_WIDTH_JOINER: char = '\u{200d}'; + is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER }); // Known prefixes must have been handled earlier. So if // we see a prefix here, it is definitely an unknown prefix. From 2c7c3697dbaac3e0599aa0e7cd3581822caec17b Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 19 Nov 2024 15:11:18 +1100 Subject: [PATCH 2/3] Improve `TokenKind` comments. - Improve wording. - Use backticks consistently for examples. --- compiler/rustc_lexer/src/lib.rs | 80 ++++++++++++++++----------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 7f221098364..c01dad810c4 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -57,11 +57,10 @@ impl Token { /// Enum representing common lexeme types. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum TokenKind { - // Multi-char tokens: - /// "// comment" + /// A line comment, e.g. `// comment`. LineComment { doc_style: Option }, - /// `/* block comment */` + /// A block comment, e.g. `/* block comment */`. /// /// Block comments can be recursive, so a sequence like `/* /* */` /// will not be considered terminated and will result in a parsing error. @@ -70,18 +69,17 @@ pub enum TokenKind { /// Any whitespace character sequence. Whitespace, - /// "ident" or "continue" - /// - /// At this step, keywords are also considered identifiers. + /// An identifier or keyword, e.g. `ident` or `continue`. Ident, - /// Like the above, but containing invalid unicode codepoints. + /// An identifier that is invalid because it contains emoji. InvalidIdent, - /// "r#ident" + /// A raw identifier, e.g. "r#ident". RawIdent, - /// An unknown prefix, like `foo#`, `foo'`, `foo"`. + /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes + /// literal prefixes that contain emoji, which are considered "invalid". /// /// Note that only the /// prefix (`foo`) is included in the token, not the separator (which is @@ -93,11 +91,12 @@ pub enum TokenKind { /// An unknown prefix in a lifetime, like `'foo#`. /// - /// Note that like above, only the `'` and prefix are included in the token + /// Like `UnknownPrefix`, only the `'` and prefix are included in the token /// and not the separator. UnknownPrefixLifetime, - /// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`. + /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into + /// several tokens: `'r` and `#` and `foo`. RawLifetime, /// Similar to the above, but *always* an error on every edition. This is used @@ -110,70 +109,69 @@ pub enum TokenKind { /// Split into the component tokens on older editions. GuardedStrPrefix, - /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid + /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid /// suffix, but may be present here on string and float literals. Users of /// this type will need to check for and reject that case. /// /// See [LiteralKind] for more details. Literal { kind: LiteralKind, suffix_start: u32 }, - /// "'a" + /// A lifetime, e.g. `'a`. Lifetime { starts_with_number: bool }, - // One-char tokens: - /// ";" + /// `;` Semi, - /// "," + /// `,` Comma, - /// "." + /// `.` Dot, - /// "(" + /// `(` OpenParen, - /// ")" + /// `)` CloseParen, - /// "{" + /// `{` OpenBrace, - /// "}" + /// `}` CloseBrace, - /// "[" + /// `[` OpenBracket, - /// "]" + /// `]` CloseBracket, - /// "@" + /// `@` At, - /// "#" + /// `#` Pound, - /// "~" + /// `~` Tilde, - /// "?" + /// `?` Question, - /// ":" + /// `:` Colon, - /// "$" + /// `$` Dollar, - /// "=" + /// `=` Eq, - /// "!" + /// `!` Bang, - /// "<" + /// `<` Lt, - /// ">" + /// `>` Gt, - /// "-" + /// `-` Minus, - /// "&" + /// `&` And, - /// "|" + /// `|` Or, - /// "+" + /// `+` Plus, - /// "*" + /// `*` Star, - /// "/" + /// `/` Slash, - /// "^" + /// `^` Caret, - /// "%" + /// `%` Percent, /// Unknown token, not expected by the lexer, e.g. "№" From e9a0c3c98c5640070e15a3cb38860a7268c1dca2 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 19 Nov 2024 15:55:34 +1100 Subject: [PATCH 3/3] Remove `TokenKind::InvalidPrefix`. It was added in #123752 to handle some cases involving emoji, but it isn't necessary because it's always treated the same as `TokenKind::InvalidIdent`. This commit removes it, which makes things a little simpler. --- compiler/rustc_lexer/src/lib.rs | 21 +++++++------------ compiler/rustc_parse/src/lexer/mod.rs | 5 ++--- src/librustdoc/html/highlight.rs | 7 +++---- .../crates/parser/src/lexed_str.rs | 2 +- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index c01dad810c4..bcb103957ba 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -99,10 +99,6 @@ pub enum TokenKind { /// several tokens: `'r` and `#` and `foo`. RawLifetime, - /// Similar to the above, but *always* an error on every edition. This is used - /// for emoji identifier recovery, as those are not meant to be ever accepted. - InvalidPrefix, - /// Guarded string literal prefix: `#"` or `##`. /// /// Used for reserving "guarded strings" (RFC 3598) in edition 2024. @@ -466,7 +462,7 @@ impl Cursor<'_> { Literal { kind, suffix_start } } // Identifier starting with an emoji. Only lexed for graceful error recovery. - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Unknown, }; let res = Token::new(token_kind, self.pos_within_token()); @@ -550,23 +546,22 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Ident, } } - fn invalid_ident_or_prefix(&mut self) -> TokenKind { + fn invalid_ident(&mut self) -> TokenKind { // Start is already eaten, eat the rest of identifier. self.eat_while(|c| { const ZERO_WIDTH_JOINER: char = '\u{200d}'; is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER }); - // Known prefixes must have been handled earlier. So if - // we see a prefix here, it is definitely an unknown prefix. - match self.first() { - '#' | '"' | '\'' => InvalidPrefix, - _ => InvalidIdent, - } + // An invalid identifier followed by '#' or '"' or '\'' could be + // interpreted as an invalid literal prefix. We don't bother doing that + // because the treatment of invalid identifiers and invalid prefixes + // would be the same. + InvalidIdent } fn c_or_byte_string( diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 226de65445c..5023e83bd67 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { let ident = Symbol::intern(lifetime_name); token::Lifetime(ident, IdentIsRaw::No) } - rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix + rustc_lexer::TokenKind::InvalidIdent // Do not recover an identifier with emoji if the codepoint is a confusable // with a recoverable substitution token, like `➖`. if !UNICODE_ARRAY.iter().any(|&(c, _, _)| { @@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), rustc_lexer::TokenKind::Unknown - | rustc_lexer::TokenKind::InvalidIdent - | rustc_lexer::TokenKind::InvalidPrefix => { + | rustc_lexer::TokenKind::InvalidIdent => { // Don't emit diagnostics for sequences of the same invalid token if swallow_next_invalid > 0 { swallow_next_invalid -= 1; diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 4def80764ea..29f6f92a6b2 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -861,10 +861,9 @@ impl<'src> Classifier<'src> { }, Some(c) => c, }, - TokenKind::RawIdent - | TokenKind::UnknownPrefix - | TokenKind::InvalidPrefix - | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)), + TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { + Class::Ident(self.new_span(before, text)) + } TokenKind::Lifetime { .. } | TokenKind::RawLifetime | TokenKind::UnknownPrefixLifetime => Class::Lifetime, diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index 3c0eb1b42a6..c97596d5097 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -183,7 +183,7 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Ident => { SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT) } - rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::InvalidIdent => { err = "Ident contains invalid characters"; IDENT }