From df29f9b0c3a2277c2eb2de2e0bf4f4b3329dbd00 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 19 Nov 2024 14:48:48 +1100
Subject: [PATCH 1/3] Improve `fake_ident_or_unknown_prefix`.

- Rename it as `invalid_ident_or_prefix`, which matches the possible
  outputs (`InvalidIdent` or `InvalidPrefix`).
- Use the local wrapper for `is_xid_continue`, for consistency.
- Make it clear what `\u{200d}` means.
---
 compiler/rustc_lexer/src/lib.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index f9f2a14dbd2..7f221098364 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -468,7 +468,7 @@ impl Cursor<'_> {
                 Literal { kind, suffix_start }
             }
             // Identifier starting with an emoji. Only lexed for graceful error recovery.
-            c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
             _ => Unknown,
         };
         let res = Token::new(token_kind, self.pos_within_token());
@@ -552,17 +552,16 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
-            c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
             _ => Ident,
         }
     }
 
-    fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
+    fn invalid_ident_or_prefix(&mut self) -> TokenKind {
         // Start is already eaten, eat the rest of identifier.
         self.eat_while(|c| {
-            unicode_xid::UnicodeXID::is_xid_continue(c)
-                || (!c.is_ascii() && c.is_emoji_char())
-                || c == '\u{200d}'
+            const ZERO_WIDTH_JOINER: char = '\u{200d}';
+            is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
         });
         // Known prefixes must have been handled earlier. So if
         // we see a prefix here, it is definitely an unknown prefix.

From 2c7c3697dbaac3e0599aa0e7cd3581822caec17b Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 19 Nov 2024 15:11:18 +1100
Subject: [PATCH 2/3] Improve `TokenKind` comments.

- Improve wording.
- Use backticks consistently for examples.
---
 compiler/rustc_lexer/src/lib.rs | 80 ++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 41 deletions(-)

diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index 7f221098364..c01dad810c4 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -57,11 +57,10 @@ impl Token {
 /// Enum representing common lexeme types.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {
-    // Multi-char tokens:
-    /// "// comment"
+    /// A line comment, e.g. `// comment`.
     LineComment { doc_style: Option<DocStyle> },
 
-    /// `/* block comment */`
+    /// A block comment, e.g. `/* block comment */`.
     ///
     /// Block comments can be recursive, so a sequence like `/* /* */`
     /// will not be considered terminated and will result in a parsing error.
@@ -70,18 +69,17 @@ pub enum TokenKind {
     /// Any whitespace character sequence.
     Whitespace,
 
-    /// "ident" or "continue"
-    ///
-    /// At this step, keywords are also considered identifiers.
+    /// An identifier or keyword, e.g. `ident` or `continue`.
     Ident,
 
-    /// Like the above, but containing invalid unicode codepoints.
+    /// An identifier that is invalid because it contains emoji.
     InvalidIdent,
 
-    /// "r#ident"
+    /// A raw identifier, e.g. "r#ident".
     RawIdent,
 
-    /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
+    /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
+    /// literal prefixes that contain emoji, which are considered "invalid".
     ///
     /// Note that only the
     /// prefix (`foo`) is included in the token, not the separator (which is
@@ -93,11 +91,12 @@ pub enum TokenKind {
 
     /// An unknown prefix in a lifetime, like `'foo#`.
     ///
-    /// Note that like above, only the `'` and prefix are included in the token
+    /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
     /// and not the separator.
     UnknownPrefixLifetime,
 
-    /// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`.
+    /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
+    /// several tokens: `'r` and `#` and `foo`.
     RawLifetime,
 
     /// Similar to the above, but *always* an error on every edition. This is used
@@ -110,70 +109,69 @@ pub enum TokenKind {
     /// Split into the component tokens on older editions.
     GuardedStrPrefix,
 
-    /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
+    /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
     /// suffix, but may be present here on string and float literals. Users of
     /// this type will need to check for and reject that case.
     ///
     /// See [LiteralKind] for more details.
     Literal { kind: LiteralKind, suffix_start: u32 },
 
-    /// "'a"
+    /// A lifetime, e.g. `'a`.
     Lifetime { starts_with_number: bool },
 
-    // One-char tokens:
-    /// ";"
+    /// `;`
     Semi,
-    /// ","
+    /// `,`
     Comma,
-    /// "."
+    /// `.`
     Dot,
-    /// "("
+    /// `(`
     OpenParen,
-    /// ")"
+    /// `)`
     CloseParen,
-    /// "{"
+    /// `{`
     OpenBrace,
-    /// "}"
+    /// `}`
     CloseBrace,
-    /// "["
+    /// `[`
     OpenBracket,
-    /// "]"
+    /// `]`
     CloseBracket,
-    /// "@"
+    /// `@`
     At,
-    /// "#"
+    /// `#`
     Pound,
-    /// "~"
+    /// `~`
     Tilde,
-    /// "?"
+    /// `?`
     Question,
-    /// ":"
+    /// `:`
     Colon,
-    /// "$"
+    /// `$`
     Dollar,
-    /// "="
+    /// `=`
     Eq,
-    /// "!"
+    /// `!`
     Bang,
-    /// "<"
+    /// `<`
     Lt,
-    /// ">"
+    /// `>`
     Gt,
-    /// "-"
+    /// `-`
     Minus,
-    /// "&"
+    /// `&`
     And,
-    /// "|"
+    /// `|`
     Or,
-    /// "+"
+    /// `+`
     Plus,
-    /// "*"
+    /// `*`
     Star,
-    /// "/"
+    /// `/`
     Slash,
-    /// "^"
+    /// `^`
     Caret,
-    /// "%"
+    /// `%`
     Percent,
 
     /// Unknown token, not expected by the lexer, e.g. "№"

From e9a0c3c98c5640070e15a3cb38860a7268c1dca2 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 19 Nov 2024 15:55:34 +1100
Subject: [PATCH 3/3] Remove `TokenKind::InvalidPrefix`.

It was added in #123752 to handle some cases involving emoji, but it
isn't necessary because it's always treated the same as
`TokenKind::InvalidIdent`. This commit removes it, which makes things a
little simpler.
---
 compiler/rustc_lexer/src/lib.rs               | 21 +++++++------------
 compiler/rustc_parse/src/lexer/mod.rs         |  5 ++---
 src/librustdoc/html/highlight.rs              |  7 +++----
 .../crates/parser/src/lexed_str.rs            |  2 +-
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index c01dad810c4..bcb103957ba 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -99,10 +99,6 @@ pub enum TokenKind {
     /// several tokens: `'r` and `#` and `foo`.
     RawLifetime,
 
-    /// Similar to the above, but *always* an error on every edition. This is used
-    /// for emoji identifier recovery, as those are not meant to be ever accepted.
-    InvalidPrefix,
-
     /// Guarded string literal prefix: `#"` or `##`.
     ///
     /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
@@ -466,7 +462,7 @@ impl Cursor<'_> {
                 Literal { kind, suffix_start }
             }
             // Identifier starting with an emoji. Only lexed for graceful error recovery.
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
             _ => Unknown,
         };
         let res = Token::new(token_kind, self.pos_within_token());
@@ -550,23 +546,22 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
             _ => Ident,
         }
     }
 
-    fn invalid_ident_or_prefix(&mut self) -> TokenKind {
+    fn invalid_ident(&mut self) -> TokenKind {
         // Start is already eaten, eat the rest of identifier.
         self.eat_while(|c| {
             const ZERO_WIDTH_JOINER: char = '\u{200d}';
             is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
         });
-        // Known prefixes must have been handled earlier. So if
-        // we see a prefix here, it is definitely an unknown prefix.
-        match self.first() {
-            '#' | '"' | '\'' => InvalidPrefix,
-            _ => InvalidIdent,
-        }
+        // An invalid identifier followed by '#' or '"' or '\'' could be
+        // interpreted as an invalid literal prefix. We don't bother doing that
+        // because the treatment of invalid identifiers and invalid prefixes
+        // would be the same.
+        InvalidIdent
     }
 
     fn c_or_byte_string(
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 226de65445c..5023e83bd67 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     let ident = Symbol::intern(lifetime_name);
                     token::Lifetime(ident, IdentIsRaw::No)
                 }
-                rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix
+                rustc_lexer::TokenKind::InvalidIdent
                     // Do not recover an identifier with emoji if the codepoint is a confusable
                     // with a recoverable substitution token, like `➖`.
                     if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
@@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                 rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
                 rustc_lexer::TokenKind::Unknown
-                | rustc_lexer::TokenKind::InvalidIdent
-                | rustc_lexer::TokenKind::InvalidPrefix => {
+                | rustc_lexer::TokenKind::InvalidIdent => {
                     // Don't emit diagnostics for sequences of the same invalid token
                     if swallow_next_invalid > 0 {
                         swallow_next_invalid -= 1;
diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index 4def80764ea..29f6f92a6b2 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
                 },
                 Some(c) => c,
             },
-            TokenKind::RawIdent
-            | TokenKind::UnknownPrefix
-            | TokenKind::InvalidPrefix
-            | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
+            TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
+                Class::Ident(self.new_span(before, text))
+            }
             TokenKind::Lifetime { .. }
             | TokenKind::RawLifetime
             | TokenKind::UnknownPrefixLifetime => Class::Lifetime,
diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
index 3c0eb1b42a6..c97596d5097 100644
--- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
+++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
                 rustc_lexer::TokenKind::Ident => {
                     SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
                 }
-                rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
+                rustc_lexer::TokenKind::InvalidIdent => {
                     err = "Ident contains invalid characters";
                     IDENT
                 }