0
0
mirror of https://github.com/python/cpython.git synced 2024-11-27 23:47:29 +01:00
cpython/Parser/tokenizer/utf8_tokenizer.c
Lysandros Nikolaou 01481f2dc1
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
2023-10-11 15:14:44 +00:00

56 lines
1.3 KiB
C

#include "Python.h"
#include "errcode.h"
#include "helpers.h"
#include "../lexer/state.h"
/* Underflow callback for string-backed tokenizers.

   Locates the end of the next line starting at tok->inp: either just past
   the next '\n', or at the terminating NUL when no newline remains.

   Returns 1 after moving tok->inp to that position (also updating
   tok->line_start and the line counter via ADVANCE_LINENO()).
   Returns 0 and sets tok->done to E_EOF when tok->inp already points at
   the terminating NUL, i.e. there is nothing left to hand out. */
static int
tok_underflow_string(struct tok_state *tok)
{
    char *next = strchr(tok->inp, '\n');

    if (next != NULL) {
        /* Include the newline itself in the line being exposed. */
        next += 1;
    }
    else if (*tok->inp == '\0') {
        /* No newline and no remaining characters: input is exhausted. */
        tok->done = E_EOF;
        return 0;
    }
    else {
        /* Final line without a trailing newline: stop at the NUL. */
        next = tok->inp + strlen(tok->inp);
    }

    /* When tok->start is unset, move the buffer start up to the cursor. */
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = next;
    return 1;
}
/* Set up tokenizer for UTF-8 string.

   Allocates a new tok_state, stores the result of
   _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok) as
   its input, and points buf/cur/inp/end at the start of that translated
   text.  The underflow callback is set to tok_underflow_string.

   Returns the new tok_state on success.  Returns NULL if the state could
   not be created, or if newline translation or creation of the "utf-8"
   encoding string fails; in the latter two cases the partially built
   state is released with _PyTokenizer_Free(). */
struct tok_state *
_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL) {
        return NULL;
    }

    char *text = _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok);
    tok->input = text;
    if (text == NULL) {
        goto error;
    }

    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = text;

    tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);
    if (tok->encoding == NULL) {
        goto error;
    }

    /* All buffer pointers start at the beginning of the translated text;
       the underflow callback advances tok->inp as lines are requested. */
    tok->inp = text;
    tok->cur = text;
    tok->buf = text;
    tok->end = text;
    tok->underflow = tok_underflow_string;
    return tok;

error:
    _PyTokenizer_Free(tok);
    return NULL;
}