cpython/Parser/tokenizer/string_tokenizer.c

#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

/* Underflow handler for the string-based tokenizer: expose one more line
   of the in-memory buffer by moving tok->inp forward.

   tok->inp is moved to just past the next '\n', or to the terminating
   NUL when no newline remains.  Returns 1 when a line was made
   available; returns 0 and sets tok->done to E_EOF when tok->inp
   already sits on the terminating NUL. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *line_end = strchr(tok->inp, '\n');
    if (line_end == NULL) {
        /* No newline left: whatever precedes the NUL is the final line. */
        line_end = strchr(tok->inp, '\0');
        if (line_end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    else {
        /* Step over the newline so it belongs to the line just exposed. */
        line_end += 1;
    }
    if (tok->start == NULL) {
        /* NOTE(review): presumably "no token in progress", so the buffer
           start can follow the cursor -- confirm against the lexer. */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = line_end;
    return 1;
}

/* Read the byte at tok->str, masked through Py_CHARMASK, and move
   tok->str one position forward. */
static int
buf_getc(struct tok_state *tok) {
    int byte = Py_CHARMASK(*tok->str);
    tok->str++;
    return byte;
}

/* Push back the byte C by stepping tok->str one position backwards.
   Nothing is written to the buffer (the original note says it may point
   into a read-only segment); the byte found there is only checked
   against C when assertions are enabled. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    --tok->str;
    assert(c == Py_CHARMASK(*tok->str));
}

/* "Set readline" callback for the string-based tokenizer.  There is no
   reader to switch here, so the encoding name ENC is simply stored in
   tok->enc.  Always returns 1. */
static int
buf_setreadl(struct tok_state *tok, const char* enc)
{
    tok->enc = enc;
    return 1;
}

/* Decode the byte string INPUT for use as the buffer of TOK.
   Look for encoding declarations inside the text, and record them
   inside TOK.

   SINGLE and PRESERVE_CRLF are forwarded unchanged to
   _PyTokenizer_translate_newlines(); its result is stored in tok->input.

   Returns a pointer to the text to tokenize.  That pointer refers either
   into tok->input (past a BOM, if one was consumed) or into the bytes
   object stored in tok->decoding_buffer.  Returns NULL on failure; any
   bytes object created here is released before returning in that case. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* An encoding was recorded via buf_setreadl during the BOM check:
           recode the text before scanning it. */
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Record the positions of the first two newlines; only the first two
       lines are inspected for a coding declaration. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            /* Do not leak the bytes object created after the BOM check;
               a no-op when utf8 is still NULL. */
            Py_XDECREF(utf8);
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            /* Unlike the first call, the length passed here
               (newl[1] - newl[0]) also covers the '\n' that ends line 2. */
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl)) {
                Py_XDECREF(utf8);
                return NULL;
            }
        }
    }
    if (tok->enc != NULL) {
        /* A coding declaration was found: recode the text accordingly. */
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    /* CAUTION: when utf8 is non-NULL, str points into its internal buffer,
       so the reference stored here is what keeps the returned pointer
       valid. */
    tok->decoding_buffer = utf8;
    return str;
}

/* Create a tokenizer state that reads from the in-memory string STR.

   EXEC_INPUT and PRESERVE_CRLF are passed straight to decode_str().
   Returns the new tokenizer state, or NULL if the state could not be
   allocated or decoding failed (the state is freed in the latter case). */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL) {
        return NULL;
    }

    char *text = decode_str(str, exec_input, tok, preserve_crlf);
    if (text == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* Every buffer pointer starts at the beginning of the decoded text;
       tok_underflow_string() moves tok->inp forward one line at a time. */
    tok->inp = text;
    tok->cur = text;
    tok->buf = text;
    tok->end = text;
    tok->underflow = &tok_underflow_string;
    return tok;
}
gh-104169: Refactor tokenizer into lexer and wrappers (#110684) * The lexer, which include the actual lexeme producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <pablogsal@gmail.com> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> 2023-10-11 17:14:44 +02:00			`#include "Python.h"`
			`#include "errcode.h"`

			`#include "helpers.h"`
			`#include "../lexer/state.h"`

			`static int`
			`tok_underflow_string(struct tok_state *tok) {`
			`char *end = strchr(tok->inp, '\n');`
			`if (end != NULL) {`
			`end++;`
			`}`
			`else {`
			`end = strchr(tok->inp, '\0');`
			`if (end == tok->inp) {`
			`tok->done = E_EOF;`
			`return 0;`
			`}`
			`}`
			`if (tok->start == NULL) {`
			`tok->buf = tok->cur;`
			`}`
			`tok->line_start = tok->cur;`
			`ADVANCE_LINENO();`
			`tok->inp = end;`
			`return 1;`
			`}`

			`/* Fetch a byte from TOK, using the string buffer. */`
			`static int`
			`buf_getc(struct tok_state *tok) {`
			`return Py_CHARMASK(*tok->str++);`
			`}`

			`/* Unfetch a byte from TOK, using the string buffer. */`
			`static void`
			`buf_ungetc(int c, struct tok_state *tok) {`
			`tok->str--;`
			`assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */`
			`}`

			`/* Set the readline function for TOK to ENC. For the string-based`
			`tokenizer, this means to just record the encoding. */`
			`static int`
			`buf_setreadl(struct tok_state *tok, const char* enc) {`
			`tok->enc = enc;`
			`return 1;`
			`}`

			`/* Decode a byte string STR for use as the buffer of TOK.`
			`Look for encoding declarations inside STR, and record them`
			`inside TOK. */`
			`static char *`
			`decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)`
			`{`
			`PyObject* utf8 = NULL;`
			`char *str;`
			`const char *s;`
			`const char *newl[2] = {NULL, NULL};`
			`int lineno = 0;`
			`tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);`
			`if (str == NULL)`
			`return NULL;`
			`tok->enc = NULL;`
			`tok->str = str;`
			`if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))`
			`return _PyTokenizer_error_ret(tok);`
			`str = tok->str; /* string after BOM if any */`
			`assert(str);`
			`if (tok->enc != NULL) {`
			`utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);`
			`if (utf8 == NULL)`
			`return _PyTokenizer_error_ret(tok);`
			`str = PyBytes_AsString(utf8);`
			`}`
			`for (s = str;; s++) {`
			`if (*s == '\0') break;`
			`else if (*s == '\n') {`
			`assert(lineno < 2);`
			`newl[lineno] = s;`
			`lineno++;`
			`if (lineno == 2) break;`
			`}`
			`}`
			`tok->enc = NULL;`
			`/* need to check line 1 and 2 separately since check_coding_spec`
			`assumes a single line as input */`
			`if (newl[0]) {`
			`if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {`
			`return NULL;`
			`}`
			`if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {`
			`if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],`
			`tok, buf_setreadl))`
			`return NULL;`
			`}`
			`}`
			`if (tok->enc != NULL) {`
			`assert(utf8 == NULL);`
			`utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);`
			`if (utf8 == NULL)`
			`return _PyTokenizer_error_ret(tok);`
			`str = PyBytes_AS_STRING(utf8);`
			`}`
			`assert(tok->decoding_buffer == NULL);`
			`tok->decoding_buffer = utf8; /* CAUTION */`
			`return str;`
			`}`

			`/* Set up tokenizer for string */`
			`struct tok_state *`
			`_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)`
			`{`
			`struct tok_state *tok = _PyTokenizer_tok_new();`
			`char *decoded;`

			`if (tok == NULL)`
			`return NULL;`
			`decoded = decode_str(str, exec_input, tok, preserve_crlf);`
			`if (decoded == NULL) {`
			`_PyTokenizer_Free(tok);`
			`return NULL;`
			`}`

			`tok->buf = tok->cur = tok->inp = decoded;`
			`tok->end = decoded;`
			`tok->underflow = &tok_underflow_string;`
			`return tok;`
			`}`