diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index f9f2a14dbd2..bcb103957ba 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -57,11 +57,10 @@ impl Token { /// Enum representing common lexeme types. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum TokenKind { - // Multi-char tokens: - /// "// comment" + /// A line comment, e.g. `// comment`. LineComment { doc_style: Option }, - /// `/* block comment */` + /// A block comment, e.g. `/* block comment */`. /// /// Block comments can be recursive, so a sequence like `/* /* */` /// will not be considered terminated and will result in a parsing error. @@ -70,18 +69,17 @@ pub enum TokenKind { /// Any whitespace character sequence. Whitespace, - /// "ident" or "continue" - /// - /// At this step, keywords are also considered identifiers. + /// An identifier or keyword, e.g. `ident` or `continue`. Ident, - /// Like the above, but containing invalid unicode codepoints. + /// An identifier that is invalid because it contains emoji. InvalidIdent, - /// "r#ident" + /// A raw identifier, e.g. "r#ident". RawIdent, - /// An unknown prefix, like `foo#`, `foo'`, `foo"`. + /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes + /// literal prefixes that contain emoji, which are considered "invalid". /// /// Note that only the /// prefix (`foo`) is included in the token, not the separator (which is @@ -93,87 +91,83 @@ pub enum TokenKind { /// An unknown prefix in a lifetime, like `'foo#`. /// - /// Note that like above, only the `'` and prefix are included in the token + /// Like `UnknownPrefix`, only the `'` and prefix are included in the token /// and not the separator. UnknownPrefixLifetime, - /// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`. + /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into + /// several tokens: `'r` and `#` and `foo`. RawLifetime, - /// Similar to the above, but *always* an error on every edition. This is used - /// for emoji identifier recovery, as those are not meant to be ever accepted. - InvalidPrefix, - /// Guarded string literal prefix: `#"` or `##`. /// /// Used for reserving "guarded strings" (RFC 3598) in edition 2024. /// Split into the component tokens on older editions. GuardedStrPrefix, - /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid + /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid /// suffix, but may be present here on string and float literals. Users of /// this type will need to check for and reject that case. /// /// See [LiteralKind] for more details. Literal { kind: LiteralKind, suffix_start: u32 }, - /// "'a" + /// A lifetime, e.g. `'a`. Lifetime { starts_with_number: bool }, - // One-char tokens: - /// ";" + /// `;` Semi, - /// "," + /// `,` Comma, - /// "." + /// `.` Dot, - /// "(" + /// `(` OpenParen, - /// ")" + /// `)` CloseParen, - /// "{" + /// `{` OpenBrace, - /// "}" + /// `}` CloseBrace, - /// "[" + /// `[` OpenBracket, - /// "]" + /// `]` CloseBracket, - /// "@" + /// `@` At, - /// "#" + /// `#` Pound, - /// "~" + /// `~` Tilde, - /// "?" + /// `?` Question, - /// ":" + /// `:` Colon, - /// "$" + /// `$` Dollar, - /// "=" + /// `=` Eq, - /// "!" + /// `!` Bang, - /// "<" + /// `<` Lt, - /// ">" + /// `>` Gt, - /// "-" + /// `-` Minus, - /// "&" + /// `&` And, - /// "|" + /// `|` Or, - /// "+" + /// `+` Plus, - /// "*" + /// `*` Star, - /// "/" + /// `/` Slash, - /// "^" + /// `^` Caret, - /// "%" + /// `%` Percent, /// Unknown token, not expected by the lexer, e.g. "№" @@ -468,7 +462,7 @@ impl Cursor<'_> { Literal { kind, suffix_start } } // Identifier starting with an emoji. Only lexed for graceful error recovery. - c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Unknown, }; let res = Token::new(token_kind, self.pos_within_token()); @@ -552,24 +546,22 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, - c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(), + c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Ident, } } - fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { + fn invalid_ident(&mut self) -> TokenKind { // Start is already eaten, eat the rest of identifier. self.eat_while(|c| { - unicode_xid::UnicodeXID::is_xid_continue(c) - || (!c.is_ascii() && c.is_emoji_char()) - || c == '\u{200d}' + const ZERO_WIDTH_JOINER: char = '\u{200d}'; + is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER }); - // Known prefixes must have been handled earlier. So if - // we see a prefix here, it is definitely an unknown prefix. - match self.first() { - '#' | '"' | '\'' => InvalidPrefix, - _ => InvalidIdent, - } + // An invalid identifier followed by '#' or '"' or '\'' could be + // interpreted as an invalid literal prefix. We don't bother doing that + // because the treatment of invalid identifiers and invalid prefixes + // would be the same. + InvalidIdent } fn c_or_byte_string( diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 226de65445c..5023e83bd67 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { let ident = Symbol::intern(lifetime_name); token::Lifetime(ident, IdentIsRaw::No) } - rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix + rustc_lexer::TokenKind::InvalidIdent // Do not recover an identifier with emoji if the codepoint is a confusable // with a recoverable substitution token, like `➖`. if !UNICODE_ARRAY.iter().any(|&(c, _, _)| { @@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), rustc_lexer::TokenKind::Unknown - | rustc_lexer::TokenKind::InvalidIdent - | rustc_lexer::TokenKind::InvalidPrefix => { + | rustc_lexer::TokenKind::InvalidIdent => { // Don't emit diagnostics for sequences of the same invalid token if swallow_next_invalid > 0 { swallow_next_invalid -= 1; diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 4def80764ea..29f6f92a6b2 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -861,10 +861,9 @@ impl<'src> Classifier<'src> { }, Some(c) => c, }, - TokenKind::RawIdent - | TokenKind::UnknownPrefix - | TokenKind::InvalidPrefix - | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)), + TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { + Class::Ident(self.new_span(before, text)) + } TokenKind::Lifetime { .. } | TokenKind::RawLifetime | TokenKind::UnknownPrefixLifetime => Class::Lifetime, diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index 3c0eb1b42a6..c97596d5097 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -183,7 +183,7 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Ident => { SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT) } - rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::InvalidIdent => { err = "Ident contains invalid characters"; IDENT }