Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Speed up parsing by making `Source` index-based: it now tracks an integer position over the input string instead of materializing a list of `(index, char)` tuples up front, so construction is O(1) and state save/restore no longer copies an iterator. ([#489](https://github.com/python-poetry/tomlkit/pull/489))
- Speed up parsing by scanning character runs in bulk: `Source.advance_while`/`advance_until` consume a whole run of whitespace, bare-key or number characters in a single pass over the input string instead of one `inc()` call per character. ([#490](https://github.com/python-poetry/tomlkit/pull/490))
- Speed up parsing of single-line strings by bulk-appending the run of ordinary characters up to the next delimiter, backslash or control character in one pass, instead of one character at a time. ([#491](https://github.com/python-poetry/tomlkit/pull/491))
- Speed up parsing by removing the internal `TOMLChar` wrapper: the parser now reads plain `str` characters from `Source` and detects end-of-input positionally, avoiding a per-character object construction and method dispatch. ([#492](https://github.com/python-poetry/tomlkit/pull/492))

### Fixed

Expand Down
39 changes: 24 additions & 15 deletions tomlkit/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
from tomlkit.items import Whitespace
from tomlkit.source import Source
from tomlkit.source import _StateHandler
from tomlkit.toml_char import TOMLChar
from tomlkit.toml_document import TOMLDocument


Expand All @@ -57,11 +56,22 @@
CTRL_CHAR_LIMIT = 0x1F
CHR_DEL = 0x7F

# TOML character classes (formerly the `TOMLChar` constants). The parser works on
# plain 1-char `str`s read from `Source`. The strings below spell out each class;
# membership tests use the `*_SET` frozensets built from them.
_SPACES = " \t"
_NL = "\n\r"
_WS = _SPACES + _NL
_BARE = string.ascii_letters + string.digits + "-_"
_KV = "= \t"
Comment on lines +61 to +65

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep only the `*_SET` variants? All `__contains__` checks should work with them, too.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or it's best to rename them without the _SET suffix.


_SPACES_SET = frozenset(_SPACES)
_NL_SET = frozenset(_NL)
_WS_SET = frozenset(_WS)
_KV_SET = frozenset(_KV)
# Character sets for Source.advance_while / advance_until bulk run scans
# (replace per-character `while self._current.is_*() and self.inc()` loops with
# (replace per-character `while self._current in <set> and self.inc()` loops with
# a single underlying-string scan).
_SPACES_SET = frozenset(TOMLChar.SPACES)
_BARE_KEY_OR_SPACE = frozenset(TOMLChar.BARE + TOMLChar.SPACES)
_BARE_KEY_OR_SPACE = frozenset(_BARE + _SPACES)
_NUM_STOP = frozenset(" \t\n\r#,]}")
_DATE_TAIL_STOP = frozenset("\t\n\r#,]}")
# Control chars invalid inside a single-line string (DEL + everything <= 0x1F
Expand Down Expand Up @@ -94,7 +104,7 @@ def _idx(self) -> int:
return self._src.idx

@property
def _current(self) -> TOMLChar:
def _current(self) -> str:
return self._src.current

@property
Expand Down Expand Up @@ -292,7 +302,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]
self.inc() # Skip #

# The comment itself
while not self.end() and not self._current.is_nl():
while not self.end() and self._current not in _NL_SET:
code = ord(self._current)
if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I):
raise self.parse_error(InvalidControlChar, code, "comments")
Expand Down Expand Up @@ -331,7 +341,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]
if self._current == "\n":
self.inc()

if self._idx != self._marker or self._current.is_ws():
if self._idx != self._marker or self._current in _WS_SET:
trail = self.extract()

return comment_ws, comment, trail
Expand All @@ -350,7 +360,7 @@ def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
self.mark()

found_equals = self._current == "="
while self._current.is_kv_sep() and self.inc():
while self._current in _KV_SET and self.inc():
if self._current == "=":
if found_equals:
raise self.parse_error(UnexpectedCharError, "=")
Expand Down Expand Up @@ -594,9 +604,9 @@ def _parse_array(self) -> Array:
while True:
# consume whitespace
mark = self._idx
self.consume(TOMLChar.SPACES + TOMLChar.NL)
self.consume(_WS)
indent = self._src[mark : self._idx]
newline = set(TOMLChar.NL) & set(indent)
newline = _NL_SET & set(indent)
if newline:
elems.append(Whitespace(indent))
continue
Expand Down Expand Up @@ -659,7 +669,7 @@ def _parse_inline_table(self) -> InlineTable:
while True:
# consume whitespace and newlines
mark = self._idx
self.consume(TOMLChar.SPACES + TOMLChar.NL)
self.consume(_WS)
raw = self._src[mark : self._idx]
if raw:
elems.add(Whitespace(raw))
Expand Down Expand Up @@ -749,7 +759,7 @@ def _parse_basic_string(self) -> String:
return self._parse_string(StringType.SLB)

def _parse_escaped_char(self, multiline: bool) -> str:
if multiline and self._current.is_ws():
if multiline and self._current in _WS_SET:
# When the last non-whitespace character on a line is
# a \, it will be trimmed along with all whitespace
# (including newlines) up to the next non-whitespace
Expand All @@ -758,7 +768,7 @@ def _parse_escaped_char(self, multiline: bool) -> str:
# hello \
# world"""
tmp = ""
while self._current.is_ws():
while self._current in _WS_SET:
tmp += self._current
# consume the whitespace, EOF here is an issue
# (middle of string)
Expand Down Expand Up @@ -847,7 +857,6 @@ def _parse_string(self, delim: StringType) -> String:
# PERF: stop-set for the single-line string-body bulk fast-path (None for
# multiline, which keeps the per-char loop because of \r\n handling).
src = self._src
EOF = src.EOF
single_stop = None
if delim.is_singleline():
single_stop = (
Expand Down Expand Up @@ -938,7 +947,7 @@ def _parse_string(self, delim: StringType) -> String:
# loop for CRLF handling).
run_start = src._idx
src.advance_until(single_stop)
if src._current is EOF:
if src.end():
# mid-string EOF — same error as the per-char inc()
raise self.parse_error(UnexpectedEofError)
value += src[run_start : src._idx]
Expand Down
30 changes: 14 additions & 16 deletions tomlkit/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from tomlkit.exceptions import ParseError
from tomlkit.exceptions import UnexpectedCharError
from tomlkit.toml_char import TOMLChar


class _State:
Expand Down Expand Up @@ -75,19 +74,20 @@ def __exit__(


class Source(str):
EOF = TOMLChar("\0")
# EOF is a placeholder value for `current` once the position is past the end of
# input. End-of-input is detected positionally (`end()` / `_idx >= len`), never by
# comparing to this value, so a real NUL character in the input is not mistaken for EOF.
EOF = "\0"

def __init__(self, _: str) -> None:
super().__init__()

# PERF: previously built `iter([(i, TOMLChar(c)) for i, c in enumerate(self)])`
# which materialized N tuples + N TOMLChars at init time (~584 k allocations
# per 150-parse benchmark). Switching to an integer index over the underlying
# str makes init O(1) and lets `inc()` just bump the index and slice the str.
# The TOMLChar cache (toml_char.py) absorbs the per-character cost.
# Track an integer index over the underlying str (Source subclasses str):
# init is O(1) and `inc()` just bumps the index and reads the next char,
# instead of materializing a list of (index, char) pairs up front.
self._idx = -1 # pre-start sentinel; first inc() will land on 0
self._marker = 0
self._current: TOMLChar = TOMLChar("")
self._current: str = ""

self._state = _StateHandler(self)

Expand All @@ -109,7 +109,7 @@ def idx(self) -> int:
return self._idx

@property
def current(self) -> TOMLChar:
def current(self) -> str:
return self._current

@property
Expand All @@ -127,13 +127,11 @@ def inc(self, exception: type[ParseError] | None = None) -> bool:
Increments the parser if the end of the input has not been reached.
Returns whether or not it was able to advance.
"""
# PERF: integer increment + cached TOMLChar lookup, no iterator/next()/
# StopIteration triage. After the first char of each kind has been seen,
# `TOMLChar(self[i])` is a dict.get cache hit.
# Integer increment + a single str index, no iterator / StopIteration triage.
next_idx = self._idx + 1
if next_idx < len(self):
self._idx = next_idx
self._current = TOMLChar(self[next_idx])
self._current = self[next_idx]
return True

# Past end : pin to len, switch current to EOF, raise if asked.
Expand All @@ -159,7 +157,7 @@ def advance_while(self, charset: frozenset) -> bool:
i += 1
if i < n:
self._idx = i
self._current = TOMLChar(self[i])
self._current = self[i]
return True
self._idx = n
self._current = self.EOF
Expand All @@ -179,7 +177,7 @@ def advance_until(self, stopset: frozenset) -> bool:
i += 1
if i < n:
self._idx = i
self._current = TOMLChar(self[i])
self._current = self[i]
return True
self._idx = n
self._current = self.EOF
Expand Down Expand Up @@ -210,7 +208,7 @@ def end(self) -> bool:
"""
Returns True if the parser has reached the end of the input.
"""
return self._current is self.EOF
return self._idx >= len(self)

def mark(self) -> None:
"""
Expand Down
52 changes: 0 additions & 52 deletions tomlkit/toml_char.py

This file was deleted.

Loading