diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index bd6a462..644bac8 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -17,9 +17,20 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install textparser development and testing dependencies + run: | + pip install --upgrade pip + pip install . + pip install .[test] - name: Test run: | python -m unittest + - name: Static type checking (mypy) + run: | + python -m mypy --strict textparser tests + - name: Linting (ruff) + run: | + ruff check textparser tests release: needs: [test] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 7d80955..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include LICENSE -include Makefile -recursive-include tests *.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3ee9f5f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,90 @@ +[build-system] +requires = ["setuptools>=68", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" + +[project] +name = "textparser" +authors = [ + {name = "Erik Moqvist"}, +] +description = "A text parser library for python." +readme = "README.rst" +requires-python = ">=3.10" +keywords = ['parser', 'parsing'] +license = "MIT" +dependencies = [ +] +dynamic = ["version"] + +[tool.setuptools_scm] +write_to = "textparser/version.py" + +[project.urls] +homepage = "https://github.com/cantools/textparser" +documentation = "https://textparser.readthedocs.io/" +repository = "https://github.com/cantools/textparser" + +[project.optional-dependencies] +examples = [ + "lark-parser", + "pyparsing", + "parsita", + "funcparserlib", + "parsy", + "parsimonious", + "textx", +] +test = [ + "mypy >= 2.1", + "ruff >= 0.15.16", + "pytest >= 9.0", + "coverage >= 7.14", +] + +[tool.setuptools] +packages = ["textparser"] + +[tool.mypy] +show_error_codes = true +warn_return_any = true +warn_unused_configs = true +no_implicit_optional = true +disallow_incomplete_defs = true +warn_redundant_casts = true +warn_unused_ignores = true + +exclude = [ + 'build', + 'textparser/version.py', +] + +[tool.ruff] +line-length = 80 +lint.extend-select = [ + "A", # flake8-builtins + "B", # pyflakes-bugbear + "C4", # flake8-comprehensions + "E", # pycodestyle Error + "F", # pyflakes + "FURB", # refurb + "I", # isort + "PIE", # flake8-pie + "PL", # pylint + "RUF", # Ruff-specific rules + "TCH", # flake8-type-checking + "UP", # pyupgrade + "W", # pycodestyle Warning +] +lint.ignore = [ + "E501", # line too long + "F541", # f-string-missing-placeholders + "PLR09", # too-many-this, too-many-that + "PLR2004", # magic-value-comparison + "PLW1641", # eq-without-hash + "PLW2901", # redefined-loop-name + "RUF012", # mutable-class-default +] +lint.isort.known-first-party = ["textparser"] +exclude = [ + "textparser/version.py", +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index be7551c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -lark-parser -pyparsing -parsita -funcparserlib -parsy -parsimonious -textx diff --git a/setup.py b/setup.py deleted file mode 100755 index e7ed7a5..0000000 --- a/setup.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from setuptools import setup -from setuptools import find_packages -import re - - -def find_version(): - return re.search(r"^__version__ = '(.*)'$", - open('textparser.py', 'r').read(), - re.MULTILINE).group(1) - - -setup(name='textparser', - version=find_version(), - description='Text parser.', - long_description=open('README.rst', 'r').read(), - author='Erik Moqvist', - author_email='erik.moqvist@gmail.com', - license='MIT', - classifiers=[ - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', - ], - keywords=['parser', 'parsing'], - url='https://github.com/eerimoq/textparser', - py_modules=['textparser'], - python_requires='>=3.10', - test_suite="tests") diff --git a/tests/test_textparser.py b/tests/test_textparser.py index 0af6c8b..d5031b9 100644 --- a/tests/test_textparser.py +++ b/tests/test_textparser.py @@ -1,34 +1,53 @@ +import collections import pickle import unittest from collections import namedtuple +from typing import cast import textparser -from textparser import Grammar -from textparser import Sequence -from textparser import Choice -from textparser import choice -from textparser import ChoiceDict -from textparser import ZeroOrMore -from textparser import ZeroOrMoreDict -from textparser import OneOrMore -from textparser import OneOrMoreDict -from textparser import DelimitedList -from textparser import Token -from textparser import TokenizeError -from textparser import tokenize_init -from textparser import Any -from textparser import AnyUntil -from textparser import Optional -from textparser import Tag -from textparser import Forward -from textparser import NoMatch -from textparser import Not -from textparser import And -from textparser import markup_line -from textparser import replace_blocks - - -def tokenize(items, add_eof_token=True): +from textparser import ( + And, + Any, + AnyUntil, + Choice, + ChoiceDict, + DelimitedList, + Forward, + Grammar, + MatchObject, + MismatchSingleton, + NoMatch, + Not, + OneOrMore, + OneOrMoreDict, + Optional, + Pattern, + Sequence, + Tag, + Token, + TokenizeError, + ZeroOrMore, + ZeroOrMoreDict, + _Tokens, + choice, + markup_line, + replace_blocks, + tokenize_init, +) + +# list of tuples containing the arguments for the Token class. Used to +# create a list of Token objects. +TokenizeItems = list[tuple[str,str]|tuple[str,str,int]] + +# Specify the tree of tokens and the expected match result for a given +# grammar +GrammarMatchSpec = tuple[TokenizeItems, MatchObject] + +# Specify the tree of tokens and the line number where the grammar +# is supposed to not match the token tree +GrammarMismatchSpec = tuple[TokenizeItems, int] + +def tokenize(items: TokenizeItems, add_eof_token: bool=True) -> list[Token]: tokens = [] for item in items: @@ -47,30 +66,30 @@ def tokenize(items, add_eof_token=True): class TextParserTest(unittest.TestCase): - def parse_and_assert_tree(self, grammar, datas): - for tokens, expected_tree in datas: - tree = grammar.parse(tokenize(tokens)) - self.assertEqual(tree, expected_tree) + def parse_and_assert_tree(self, grammar: Grammar, test_specs: list[GrammarMatchSpec]) -> None: + for token_items, expected_tree in test_specs: + token_tree = grammar.parse(tokenize(token_items)) + self.assertEqual(token_tree, expected_tree) - def parse_and_assert_mismatch(self, grammar, datas): - for tokens, line in datas: - tokens = tokenize(tokens) + def parse_and_assert_mismatch(self, grammar: Grammar, test_specs: list[GrammarMismatchSpec]) -> None: + for token_items, line in test_specs: + token_tree = tokenize(token_items) with self.assertRaises(textparser.GrammarError) as cm: - grammar.parse(tokens) + grammar.parse(token_tree) self.assertEqual(cm.exception.offset, line) - def test_grammar_sequence(self): + def test_grammar_sequence(self) -> None: grammar = Grammar(Sequence('NUMBER', 'WORD')) tokens = tokenize([ ('NUMBER', '1.45'), ('WORD', 'm') ]) - tree = grammar.parse(tokens) - self.assertEqual(tree, ['1.45', 'm']) + match_object = grammar.parse(tokens) + self.assertEqual(match_object, ['1.45', 'm']) - def test_grammar_sequence_mismatch(self): + def test_grammar_sequence_mismatch(self) -> None: grammar = Grammar(Sequence('NUMBER', 'WORD')) tokens = tokenize([('NUMBER', '1.45')]) @@ -79,10 +98,10 @@ def test_grammar_sequence_mismatch(self): self.assertEqual(cm.exception.offset, -1) - def test_grammar_choice(self): + def test_grammar_choice(self) -> None: grammar = Grammar(Choice('NUMBER', 'WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'm')], 'm' @@ -95,18 +114,18 @@ def test_grammar_choice(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_choice_mismatch(self): + def test_grammar_choice_mismatch(self) -> None: grammar = Grammar(Choice(Sequence('NUMBER', 'WORD'), 'WORD')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ([('NUMBER', '1', 5)], -1), ([('NUMBER', '1', 5), ('NUMBER', '2', 7)], 7) ] self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_choice_dict(self): + def test_grammar_choice_dict(self) -> None: number = Forward() number <<= Sequence('NUMBER') grammar = Grammar(ChoiceDict(number, @@ -114,10 +133,13 @@ def test_grammar_choice_dict(self): ChoiceDict('BAR'), 'FIE')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'm')], - ('foo', ['m']) + # the cast is necessary because mypy does not + # recognize (str, MatchObject) tuples as MatchObject, + # even though it should... + cast('MatchObject', ('foo', ['m'])) ), ( [('NUMBER', '5')], @@ -135,18 +157,18 @@ def test_grammar_choice_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_choice_dict_mismatch(self): + def test_grammar_choice_dict_mismatch(self) -> None: grammar = Grammar(ChoiceDict(Sequence('NUMBER'), Sequence('WORD'))) tokens = tokenize([(',', ',', 3)]) - with self.assertRaises(textparser.Error) as cm: + with self.assertRaises(textparser.GrammarError) as cm: grammar.parse(tokens) self.assertEqual(cm.exception.offset, 3) - def test_grammar_choice_dict_init(self): - datas = [ + def test_grammar_choice_dict_init(self) -> None: + datas: list[tuple[collections.abc.Sequence[Pattern|str], str]] = [ ( ('WORD', 'WORD'), "First token kind must be unique, but WORD isn't." @@ -167,10 +189,10 @@ def test_grammar_choice_dict_init(self): self.assertEqual(str(cm.exception), message) - def test_grammar_delimited_list(self): + def test_grammar_delimited_list(self) -> None: grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo')], [['foo'], []] @@ -187,10 +209,10 @@ def test_grammar_delimited_list(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_delimited_list_mismatch(self): + def test_grammar_delimited_list_mismatch(self) -> None: grammar = Grammar(Sequence(DelimitedList('WORD'), Optional('.'))) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [ ('WORD', 'foo', 1), @@ -212,10 +234,10 @@ def test_grammar_delimited_list_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_zero_or_more(self): + def test_grammar_zero_or_more(self) -> None: grammar = Grammar(ZeroOrMore('WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], [] @@ -232,11 +254,11 @@ def test_grammar_zero_or_more(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_zero_or_more_partial_element_match(self): + def test_grammar_zero_or_more_partial_element_match(self) -> None: grammar = Grammar(Sequence( ZeroOrMore(Sequence('WORD', 'NUMBER')), 'WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [ ('WORD', 'foo'), @@ -250,10 +272,10 @@ def test_grammar_zero_or_more_partial_element_match(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_zero_or_more_dict(self): + def test_grammar_zero_or_more_dict(self) -> None: grammar = Grammar(ZeroOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], {} @@ -271,10 +293,10 @@ def test_grammar_zero_or_more_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more(self): + def test_grammar_one_or_more(self) -> None: grammar = Grammar(OneOrMore('WORD')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo')], ['foo'] @@ -287,10 +309,10 @@ def test_grammar_one_or_more(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more_mismatch(self): + def test_grammar_one_or_more_mismatch(self) -> None: grammar = Grammar(OneOrMore('WORD')) - datas = [ + datas = cast('list[GrammarMismatchSpec]', [ ( [] , -1 @@ -299,14 +321,14 @@ def test_grammar_one_or_more_mismatch(self): [('NUMBER', 'foo', 2)], 2 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_one_or_more_dict(self): + def test_grammar_one_or_more_dict(self) -> None: grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'foo'), ('NUMBER', '1')], { @@ -326,10 +348,10 @@ def test_grammar_one_or_more_dict(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_one_or_more_dict_mismatch(self): + def test_grammar_one_or_more_dict_mismatch(self) -> None: grammar = Grammar(OneOrMoreDict(Sequence('WORD', 'NUMBER'))) - datas = [ + datas = cast('list[GrammarMismatchSpec]', [ ( [('WORD', 'foo', 5)], -1 @@ -350,14 +372,14 @@ def test_grammar_one_or_more_dict_mismatch(self): ], 8 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_any(self): + def test_grammar_any(self) -> None: grammar = Grammar(Any()) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('A', r'a')], 'a' @@ -370,10 +392,10 @@ def test_grammar_any(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_any_until(self): + def test_grammar_any_until(self) -> None: grammar = Grammar(Sequence(AnyUntil('STRING'), 'STRING')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1'), ('WORD', 'a'), @@ -384,12 +406,12 @@ def test_grammar_any_until(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_any_until_sequence(self): + def test_grammar_any_until_sequence(self) -> None: grammar = Grammar(Sequence(AnyUntil(Sequence('WORD', 'STRING')), 'WORD', 'STRING')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1'), ('WORD', 'a'), @@ -401,7 +423,7 @@ def test_grammar_any_until_sequence(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_1(self): + def test_grammar_1(self) -> None: grammar = Grammar(Sequence( 'IF', choice(Sequence(choice('A', 'B'), 'STRING'), @@ -412,7 +434,7 @@ def test_grammar_1(self): choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), '.'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [ ('IF', 'IF'), @@ -437,7 +459,7 @@ def test_grammar_1(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_1_mismatch(self): + def test_grammar_1_mismatch(self) -> None: grammar = Grammar(Sequence( 'IF', choice(Sequence(choice('A', 'B'), 'STRING'), @@ -448,7 +470,7 @@ def test_grammar_1_mismatch(self): choice(DelimitedList('STRING'), ZeroOrMore('NUMBER')), '.'), '.'))) - datas = [ + datas = cast('list[GrammarMismatchSpec]', [ ( [ ('IF', 'IF', 1), @@ -483,16 +505,16 @@ def test_grammar_1_mismatch(self): ], 5 ) - ] + ]) self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_forward(self): + def test_grammar_forward(self) -> None: foo = Forward() foo <<= Sequence('FOO') grammar = Grammar(foo) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('FOO', 'foo')], ['foo'] @@ -501,12 +523,12 @@ def test_grammar_forward(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_forward_text(self): + def test_grammar_forward_text(self) -> None: foo = Forward() foo <<= 'FOO' grammar = Grammar(foo) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('FOO', 'foo')], 'foo' @@ -515,12 +537,12 @@ def test_grammar_forward_text(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_optional(self): + def test_grammar_optional(self) -> None: grammar = Grammar(Sequence(Optional('WORD'), Optional('WORD'), Optional('NUMBER'))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [], [[], [], []] @@ -545,33 +567,33 @@ def test_grammar_optional(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_tag(self): + def test_grammar_tag(self) -> None: grammar = Grammar(Tag('a', Tag('b', choice(Tag('c', 'WORD'), Tag('d', Optional('NUMBER')))))) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('WORD', 'bar')], - ('a', ('b', ('c', 'bar'))) + cast('MatchObject', ('a', ('b', ('c', 'bar')))) ), ( [('NUMBER', '1')], - ('a', ('b', ('d', ['1']))) + cast('MatchObject', ('a', ('b', ('d', ['1'])))) ), ( [], - ('a', ('b', ('d', []))) + cast('MatchObject', ('a', ('b', ('d', [])))) ) ] self.parse_and_assert_tree(grammar, datas) - def test_grammar_tag_mismatch(self): + def test_grammar_tag_mismatch(self) -> None: grammar = Grammar(Tag('a', 'WORD')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('NUMBER', 'bar')], 1 @@ -580,10 +602,10 @@ def test_grammar_tag_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_and(self): + def test_grammar_and(self) -> None: grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], [[], '1'] @@ -592,10 +614,10 @@ def test_grammar_and(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_and_mismatch(self): + def test_grammar_and_mismatch(self) -> None: grammar = Grammar(Sequence(And('NUMBER'), 'NUMBER')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 3 @@ -604,10 +626,10 @@ def test_grammar_and_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_not(self): + def test_grammar_not(self) -> None: grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], [[], '1'] @@ -616,10 +638,10 @@ def test_grammar_not(self): self.parse_and_assert_tree(grammar, datas) - def test_grammar_not_mismatch(self): + def test_grammar_not_mismatch(self) -> None: grammar = Grammar(Sequence(Not('WORD'), 'NUMBER')) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('WORD', 'foo', 3), ('NUMBER', '1', 4)], 3 @@ -628,10 +650,10 @@ def test_grammar_not_mismatch(self): self.parse_and_assert_mismatch(grammar, datas) - def test_grammar_no_match(self): + def test_grammar_no_match(self) -> None: grammar = Grammar(NoMatch()) - datas = [ + datas: list[GrammarMismatchSpec] = [ ( [('NUMBER', '1', 3)], 3 @@ -644,20 +666,20 @@ def test_grammar_no_match(self): self.parse_and_assert_mismatch(grammar, datas) - def test_parse_start_and_end_of_file(self): + def test_parse_start_and_end_of_file(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__SOF__', '__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__SOF__', '__EOF__')) self.assertEqual(Parser().parse('', match_sof=True), ['__SOF__', '__EOF__']) - def test_parse_start_of_file_mismatch(self): + def test_parse_start_of_file_mismatch(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__EOF__')) with self.assertRaises(textparser.ParseError) as cm: Parser().parse('123', match_sof=True) @@ -665,43 +687,46 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 1, column 1: ">>!<<123"') - def test_parse_end_of_file(self): + def test_parse_end_of_file(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return '__EOF__' + def grammar(self) -> Grammar: + return Grammar('__EOF__') self.assertEqual(Parser().parse('', match_sof=False), '__EOF__') - def test_grammar_none(self): + def test_grammar_none(self) -> None: class AnyAsNone(textparser.Pattern): - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject|MismatchSingleton: tokens.get_value() - return None + # the cast is a bit hacky because Pattern.match() is + # not supposed to return None. (this should possibly + # return textparser.MISMATCH) + return cast('MatchObject', None) grammar = Grammar(AnyAsNone()) - datas = [ + datas: list[GrammarMatchSpec] = [ ( [('NUMBER', '1')], - None + cast('MatchObject', None) ) ] self.parse_and_assert_tree(grammar, datas) - def test_grammar_error(self): + def test_grammar_error(self) -> None: grammar = Grammar(NoMatch()) - datas = [ + datas: list[list[tuple[str, str]|tuple[str, str, int]]] = [ [('NUMBER', '1', 3)], [('WORD', 'foo', 3)] ] - for tokens in datas: - tokens = tokenize(tokens) + for token_args in datas: + tokens = tokenize(token_args) with self.assertRaises(textparser.GrammarError) as cm: grammar.parse(tokens) @@ -710,8 +735,9 @@ def test_grammar_error(self): self.assertEqual(str(cm.exception), 'Invalid syntax at offset 3.') - def test_tokenize_error(self): - datas = [ + def test_tokenize_error(self) -> None: + # list of (offset, text, message) tuples + datas: list[tuple[int, str, str]] = [ (2, 'hej', 'Invalid syntax at line 1, column 3: "he>>!<>!<>!<<"'), @@ -726,8 +752,9 @@ def test_tokenize_error(self): self.assertEqual(cm.exception.offset, offset) self.assertEqual(str(cm.exception), message) - def test_create_token_re(self): - datas = [ + def test_create_token_re(self) -> None: + # list of (TokenTree, expected_regex) tuples + datas: list[tuple[TokenizeItems, str]] = [ ( [('A', r'a')], '(?Pa)' @@ -744,17 +771,17 @@ def test_create_token_re(self): [Token(kind='__SOF__', value='__SOF__', offset=0)]) self.assertEqual(re_token, expected_re_token) - def test_parser(self): + def test_parser(self) -> None: class Parser(textparser.Parser): - def keywords(self): - return set([ + def keywords(self) -> set[str]: + return { 'IF', 'A', 'B' - ]) + } - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), @@ -764,17 +791,17 @@ def token_specs(self): ('MISMATCH', r'.') ] - def grammar(self): - return Sequence( + def grammar(self) -> Grammar: + return Grammar(Sequence( 'IF', Optional(choice('A', 'B')), 'ESCAPED_STRING', 'WORD', Optional(choice(DelimitedList('ESCAPED_STRING'), ZeroOrMore('NUMBER'))), - '.') + '.')) - datas = [ + datas: list[tuple[str, MatchObject, MatchObject]] = [ ( 'IF "foo" bar .', ['IF', [], '"foo"', 'bar', [[]], '.'], @@ -814,10 +841,10 @@ def grammar(self): tree = Parser().parse(text, token_tree=True) self.assertEqual(tree, expected_token_tree) - def test_parser_default_keywords(self): + def test_parser_default_keywords(self) -> None: class Parser(textparser.Parser): - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), @@ -827,17 +854,18 @@ def token_specs(self): ('MISMATCH', r'.') ] - def grammar(self): - return Sequence( + def grammar(self) -> Grammar: + return Grammar(Sequence( 'WORD', Optional('WORD'), 'ESCAPED_STRING', 'WORD', Optional(choice(DelimitedList('ESCAPED_STRING'), ZeroOrMore('NUMBER'))), - '.') + '.')) - datas = [ + # list of (input_string, expected_flat_match, expected_tree_match) tuples + datas: list[tuple[str, MatchObject, MatchObject]] = [ ( 'IF "foo" bar .', ['IF', [], '"foo"', 'bar', [[]], '.'], @@ -877,7 +905,7 @@ def grammar(self): tree = Parser().parse(text, token_tree=True) self.assertEqual(tree, expected_token_tree) - def test_parser_bare(self): + def test_parser_bare(self) -> None: class Parser(textparser.Parser): pass @@ -887,26 +915,26 @@ class Parser(textparser.Parser): self.assertEqual(str(cm.exception), 'No grammar defined.') - def test_parser_default_token_specs(self): + def test_parser_default_token_specs(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return 'WORD' + def grammar(self) -> Grammar: + return Grammar('WORD') tree = Parser().parse('foo') self.assertEqual(tree, 'foo') - def test_parser_tokenize_mismatch(self): + def test_parser_tokenize_mismatch(self) -> None: class Parser(textparser.Parser): - def token_specs(self): + def token_specs(self) -> list[tuple[str, str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'), ('MISMATCH', r'.') ] - def grammar(self): + def grammar(self) -> Grammar: return Grammar('NUMBER') with self.assertRaises(textparser.ParseError) as cm: @@ -918,17 +946,17 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 2, column 3: "34>>!< None: class Parser(textparser.Parser): - def tokenize(self, _text): + def tokenize(self, _text: str) -> list[Token]: return tokenize([ ('NUMBER', '1.45', 0), ('NUMBER', '2', 5) ]) - def grammar(self): - return Sequence('NUMBER', 'WORD') + def grammar(self) -> Grammar: + return Grammar(Sequence('NUMBER', 'WORD')) with self.assertRaises(textparser.ParseError) as cm: Parser().parse('1.45 2') @@ -939,18 +967,18 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 1, column 6: "1.45 >>!<<2"') - def test_parser_grammar_mismatch_choice_max(self): + def test_parser_grammar_mismatch_choice_max(self) -> None: class Parser(textparser.Parser): - def __init__(self, tokens): + def __init__(self, tokens: TokenizeItems) -> None: self._tokens = tokens - def tokenize(self, _text): + def tokenize(self, _text: str) -> list[Token]: return tokenize(self._tokens, add_eof_token=False) - def grammar(self): - return Choice(Sequence('NUMBER', 'WORD'), - 'WORD') + def grammar(self) -> Grammar: + return Grammar(Choice(Sequence('NUMBER', 'WORD'), + 'WORD')) Data = namedtuple('Data', [ @@ -995,13 +1023,13 @@ def grammar(self): self.assertEqual(cm.exception.column, column) self.assertEqual(str(cm.exception), message) - def test_parse_error(self): + def test_parse_error(self) -> None: class Parser(textparser.Parser): - def tokenize(self, text): + def tokenize(self, text: str) -> list[Token]: raise TokenizeError(text, 5) - def grammar(self): + def grammar(self) -> Grammar: return Grammar(Sequence('NUMBER', 'WORD')) with self.assertRaises(textparser.ParseError) as cm: @@ -1014,7 +1042,7 @@ def grammar(self): self.assertEqual(str(cm.exception), 'Invalid syntax at line 2, column 3: "34>>!<<56"') - def test_markup_line(self): + def test_markup_line(self) -> None: datas = [ (0, '>>!<<0', None), (1, '0>>!<<', None), @@ -1037,7 +1065,7 @@ def test_markup_line(self): self.assertEqual(text, line) - def test_replace_blocks(self): + def test_replace_blocks(self) -> None: datas = [ ('{}', '{}'), ('{{}}', '{ }'), @@ -1049,7 +1077,7 @@ def test_replace_blocks(self): new = replace_blocks(old) self.assertEqual(new, expected) - def test_replace_blocks_start_end(self): + def test_replace_blocks_start_end(self) -> None: datas = [ ('1[a]2[b]3', '1[ ]2[ ]3', '[', ']'), ('1{a}2{b}3', '1{ }2{ }3', '{', '}'), @@ -1061,13 +1089,13 @@ def test_replace_blocks_start_end(self): new = replace_blocks(old, start, end) self.assertEqual(new, expected) - def test_any_zero_or_more(self): + def test_any_zero_or_more(self) -> None: class Parser(textparser.Parser): - def keywords(self): - return ['interesting_group'] + def keywords(self) -> set[str]: + return {'interesting_group'} - def token_specs(self): + def token_specs(self) -> list[tuple[str,str]|tuple[str,str,str]]: return [ ('SKIP', r'[ \r\n\t]+'), ('WORD', r'[A-Za-z0-9_]+'), @@ -1077,16 +1105,16 @@ def token_specs(self): ('EQUAL', '=', r'='), ] - def grammar(self): + def grammar(self) -> Grammar: interesting_group = textparser.Sequence( 'interesting_group', '{', ZeroOrMore(Sequence('WORD', '=', 'WORD', ';')), '}', ';') - return Sequence(AnyUntil('interesting_group'), - interesting_group, - ZeroOrMore(Any())) + return Grammar(Sequence(AnyUntil('interesting_group'), + interesting_group, + ZeroOrMore(Any()))) text = ''' @@ -1105,6 +1133,7 @@ def grammar(self): ''' tree = Parser().parse(text) + assert isinstance(tree, list) self.assertEqual(tree[1], [ 'interesting_group', @@ -1116,11 +1145,11 @@ def grammar(self): '}', ';']) - def test_error_picklable(self): + def test_error_picklable(self) -> None: class Parser(textparser.Parser): - def grammar(self): - return Sequence('__EOF__') + def grammar(self) -> Grammar: + return Grammar(Sequence('__EOF__')) try: Parser().parse('123', match_sof=True) diff --git a/textparser.py b/textparser/__init__.py similarity index 69% rename from textparser.py rename to textparser/__init__.py index 8d76d72..4f1a234 100644 --- a/textparser.py +++ b/textparser/__init__.py @@ -1,120 +1,134 @@ # A text parser. +import collections.abc import re -from collections import namedtuple +import typing +from dataclasses import dataclass +from enum import Enum, auto from operator import itemgetter - __author__ = 'Erik Moqvist' -__version__ = '0.24.0' - +from .version import __version__ # noqa: F401 -class _Mismatch(object): - pass +class _Mismatch(Enum): + MISMATCH = auto() -MISMATCH = _Mismatch() +MISMATCH = _Mismatch.MISMATCH """Returned by :func:`~textparser.Pattern.match()` on mismatch. """ +MismatchSingleton = typing.Literal[_Mismatch.MISMATCH] -class _String(object): - """Matches a specific token kind. - - """ - - def __init__(self, kind): - self.kind = kind - - def match(self, tokens): - if self.kind == tokens.peek().kind: - return tokens.get_value() - else: - return MISMATCH - +@dataclass(slots=True, frozen=True, eq=True) +class Token: + kind: str + value: str | None + offset: int -class _Tokens(object): +class _Tokens: - def __init__(self, tokens): + def __init__(self, tokens: list[Token]): self._tokens = tokens self._pos = 0 self._max_pos = -1 - self._stack = [] + self._stack: list[int] = [] - def get_value(self): + def get_value(self) -> Token | str: pos = self._pos self._pos += 1 return self._tokens[pos] - def peek(self): + def peek(self) -> Token: return self._tokens[self._pos] - def peek_max(self): + def peek_max(self) -> Token: pos = self._pos - if self._max_pos > pos: - pos = self._max_pos + pos = max(pos, self._max_pos) if pos >= len(self._tokens): return self._tokens[-1] else: return self._tokens[pos] - def save(self): + def save(self) -> None: self._stack.append(self._pos) - def restore(self): + def restore(self) -> None: self._pos = self._stack.pop() - def update(self): + def update(self) -> None: self._stack[-1] = self._pos - def mark_max_restore(self): - if self._pos > self._max_pos: - self._max_pos = self._pos + def mark_max_restore(self) -> None: + self._max_pos = max(self._max_pos, self._pos) self._pos = self._stack.pop() - def mark_max_load(self): - if self._pos > self._max_pos: - self._max_pos = self._pos + def mark_max_load(self) -> None: + self._max_pos = max(self._max_pos, self._pos) self._pos = self._stack[-1] - def drop(self): + def drop(self) -> None: self._stack.pop() - def __repr__(self): + def __repr__(self) -> str: return str(self._tokens[self._pos:self._pos + 2]) +MatchObject = list["MatchObject"] | dict[str, list["MatchObject"]] | tuple[str,"MatchObject"] | Token | str + +class Pattern: + """Base class of all patterns. + + """ + + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else + on match. + + """ + + raise NotImplementedError('To be implemented by subclasses.') + +class _String(Pattern): + """Matches a specific token kind. + + """ + + def __init__(self, kind: str) -> None: + self.kind = kind + + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + if self.kind == tokens.peek().kind: + return tokens.get_value() + else: + return MISMATCH class _StringTokens(_Tokens): - def get_value(self): + def get_value(self) -> Token | str: pos = self._pos self._pos += 1 - return self._tokens[pos].value + return typing.cast('str', self._tokens[pos].value) -def _wrap_string(item): +def _wrap_string(item: Pattern | str) -> Pattern: if isinstance(item, str): item = _String(item) return item - -def _wrap_strings(items): +def _wrap_strings(items: collections.abc.Sequence[Pattern | str]) -> list[Pattern]: return [_wrap_string(item) for item in items] -def _format_invalid_syntax(text, offset): - return 'Invalid syntax at line {}, column {}: "{}"'.format( - line(text, offset), - column(text, offset), - markup_line(text, offset)) +def _format_invalid_syntax(text: str, offset: int) -> str: + return f'Invalid syntax at line {line(text, offset)}, column {column(text, offset)}: "{markup_line(text, offset)}"' class Error(Exception): @@ -122,7 +136,6 @@ class Error(Exception): """ - pass class TokenizeError(Error): @@ -131,14 +144,14 @@ class TokenizeError(Error): """ - def __init__(self, text, offset): + def __init__(self, text: str, offset: int) -> None: self._text = text self._offset = offset message = _format_invalid_syntax(text, offset) - super(TokenizeError, self).__init__(message) + super().__init__(message) @property - def text(self): + def text(self) -> str: """The input text to the tokenizer. """ @@ -146,7 +159,7 @@ def text(self): return self._text @property - def offset(self): + def offset(self) -> int: """Offset into the text where the tokenizer failed. """ @@ -160,13 +173,13 @@ class GrammarError(Error): """ - def __init__(self, offset): + def __init__(self, offset: int) -> None: self._offset = offset - message = 'Invalid syntax at offset {}.'.format(offset) - super(GrammarError, self).__init__(message) + message = f'Invalid syntax at offset {offset}.' + super().__init__(message) @property - def offset(self): + def offset(self) -> int: """Offset into the text where the parser failed. """ @@ -179,16 +192,16 @@ class ParseError(Error): """ - def __init__(self, text, offset): + def __init__(self, text: str, offset: int): self._text = text self._offset = offset self._line = line(text, offset) self._column = column(text, offset) message = _format_invalid_syntax(text, offset) - super(ParseError, self).__init__(message) + super().__init__(message) @property - def text(self): + def text(self) -> str: """The input text to the parser. """ @@ -196,7 +209,7 @@ def text(self): return self._text @property - def offset(self): + def offset(self) -> int: """Offset into the text where the parser failed. """ @@ -204,7 +217,7 @@ def offset(self): return self._offset @property - def line(self): + def line(self) -> int: """Line where the parser failed. """ @@ -212,45 +225,27 @@ def line(self): return self._line @property - def column(self): + def column(self) -> int: """Column where the parser failed. """ return self._column - def __reduce__(self): + def __reduce__(self) -> tuple[typing.Any, ...]: """Adds pickling support.""" return type(self), (self._text, self._offset), {} - -Token = namedtuple('Token', ['kind', 'value', 'offset']) - - -class Pattern(object): - """Base class of all patterns. - - """ - - def match(self, tokens): - """Returns :data:`~textparser.MISMATCH` on mismatch, and anything else - on match. - - """ - - raise NotImplementedError('To be implemented by subclasses.') - - class Sequence(Pattern): """Matches a sequence of patterns. Becomes a list in the parse tree. """ - def __init__(self, *patterns): + def __init__(self, *patterns: Pattern | str) -> None: self.patterns = _wrap_strings(patterns) - def match(self, tokens): - matched = [] + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + matched: list[MatchObject] = [] for pattern in self.patterns: mo = pattern.match(tokens) @@ -269,10 +264,10 @@ class Choice(Pattern): """ - def __init__(self, *patterns): + def __init__(self, *patterns: Pattern | str) -> None: self._patterns = _wrap_strings(patterns) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() for pattern in self._patterns: @@ -288,6 +283,55 @@ def match(self, tokens): return MISMATCH +class Tag(Pattern): + """Tags any matched `pattern` with name `name`. Becomes a two-tuple of + `name` and match in the parse tree. + + """ + + def __init__(self, name: str, pattern: Pattern | str) -> None: + self._name = name + self._pattern = _wrap_string(pattern) + + @property + def pattern(self) -> Pattern: + return self._pattern + + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + mo = self._pattern.match(tokens) + + if mo is not MISMATCH: + return (self._name, mo) + else: + return MISMATCH + + +class Forward(Pattern): + """Forward declaration of a pattern. + + .. code-block:: python + + >>> foo = Forward() + >>> foo <<= Sequence('NUMBER') + + """ + + def __init__(self) -> None: + self._pattern: Pattern | None = None + + @property + def pattern(self) -> Pattern | None: + return self._pattern + + def __ilshift__(self, other: Pattern | str) -> "Forward": + self._pattern = _wrap_string(other) + + return self + + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + if self._pattern is not None: + return self._pattern.match(tokens) + return MISMATCH class ChoiceDict(Pattern): """Matches any of given patterns. The first token kind of all patterns @@ -299,40 +343,42 @@ class ChoiceDict(Pattern): """ - def __init__(self, *patterns): - self._patterns_map = {} - patterns = _wrap_strings(patterns) + def __init__(self, *patterns: Pattern | str) -> None: + self._patterns_map: dict[str, Pattern] = {} + wrapped_patterns = _wrap_strings(patterns) - for pattern in patterns: + for pattern in wrapped_patterns: self._check_pattern(pattern, pattern) @property - def patterns_map(self): + def patterns_map(self) -> dict[str, Pattern]: return self._patterns_map - def _check_pattern(self, inner, outer): + def _check_pattern(self, inner: Pattern, outer: Pattern) -> None: if isinstance(inner, _String): self._add_pattern(inner.kind, outer) elif isinstance(inner, Sequence): self._check_pattern(inner.patterns[0], outer) elif isinstance(inner, (Tag, Forward)): + if inner.pattern is None: + raise Error( + f'No inner pattern defined for {type(inner)}.') self._check_pattern(inner.pattern, outer) elif isinstance(inner, ChoiceDict): for pattern in inner.patterns_map.values(): self._check_pattern(pattern, outer) else: raise Error( - 'Unsupported pattern type {}.'.format(type(inner))) + f'Unsupported pattern type {type(inner)}.') - def _add_pattern(self, kind, pattern): + def _add_pattern(self, kind: str, pattern: Pattern) -> None: if kind in self._patterns_map: raise Error( - "First token kind must be unique, but {} isn't.".format( - kind)) + f"First token kind must be unique, but {kind} isn't.") self._patterns_map[kind] = pattern - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: kind = tokens.peek().kind if kind in self._patterns_map: @@ -347,11 +393,11 @@ class Repeated(Pattern): """ - def __init__(self, pattern, minimum=0): + def __init__(self, pattern: Pattern | str, minimum: int=0) -> None: self._pattern = _wrap_string(pattern) self._minimum = minimum - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: matched = [] tokens.save() @@ -381,16 +427,16 @@ class RepeatedDict(Repeated): """ - def __init__(self, pattern, minimum=0, key=None): - super(RepeatedDict, self).__init__(pattern, minimum) + def __init__(self, pattern: Pattern | str, minimum: int=0, key: typing.Callable[[MatchObject], str] | None=None) -> None: + super().__init__(pattern, minimum) if key is None: - key = itemgetter(0) + key = typing.cast('typing.Callable[[MatchObject], str]', itemgetter(0)) self._key = key - def match(self, tokens): - matched = {} + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + matched: dict[str, list[MatchObject]] = {} tokens.save() while True: @@ -422,8 +468,8 @@ class ZeroOrMore(Repeated): """ - def __init__(self, pattern): - super(ZeroOrMore, self).__init__(pattern, 0) + def __init__(self, pattern: Pattern | str) -> None: + super().__init__(pattern, 0) class ZeroOrMoreDict(RepeatedDict): @@ -433,8 +479,8 @@ class ZeroOrMoreDict(RepeatedDict): """ - def __init__(self, pattern, key=None): - super(ZeroOrMoreDict, self).__init__(pattern, 0, key) + def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None: + super().__init__(pattern, 0, key) class OneOrMore(Repeated): @@ -444,8 +490,8 @@ class OneOrMore(Repeated): """ - def __init__(self, pattern): - super(OneOrMore, self).__init__(pattern, 1) + def __init__(self, pattern: Pattern | str) -> None: + super().__init__(pattern, 1) class OneOrMoreDict(RepeatedDict): @@ -455,8 +501,8 @@ class OneOrMoreDict(RepeatedDict): """ - def __init__(self, pattern, key=None): - super(OneOrMoreDict, self).__init__(pattern, 1, key) + def __init__(self, pattern: Pattern | str, key: typing.Callable[[MatchObject], str] | None=None) -> None: + super().__init__(pattern, 1, key) class DelimitedList(Pattern): @@ -466,11 +512,11 @@ class DelimitedList(Pattern): """ - def __init__(self, pattern, delim=','): + def __init__(self, pattern: Pattern | str, delim: str=',') -> None: self._pattern = _wrap_string(pattern) self._delim = _wrap_string(delim) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: # First pattern. mo = self._pattern.match(tokens) @@ -507,10 +553,10 @@ class Optional(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) @@ -529,7 +575,7 @@ class Any(Pattern): """ - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: if tokens.peek().kind == '__EOF__': return MISMATCH else: @@ -542,11 +588,11 @@ class AnyUntil(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): - matched = [] + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: + matched: list[MatchObject] = [] while True: tokens.save() @@ -569,10 +615,10 @@ class And(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) tokens.restore() @@ -591,10 +637,10 @@ class Not(Pattern): """ - def __init__(self, pattern): + def __init__(self, pattern: Pattern | str) -> None: self._pattern = _wrap_string(pattern) - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: tokens.save() mo = self._pattern.match(tokens) tokens.restore() @@ -610,75 +656,26 @@ class NoMatch(Pattern): """ - def match(self, tokens): + def match(self, tokens: _Tokens) -> MatchObject | MismatchSingleton: return MISMATCH - -class Tag(Pattern): - """Tags any matched `pattern` with name `name`. Becomes a two-tuple of - `name` and match in the parse tree. - - """ - - def __init__(self, name, pattern): - self._name = name - self._pattern = _wrap_string(pattern) - - @property - def pattern(self): - return self._pattern - - def match(self, tokens): - mo = self._pattern.match(tokens) - - if mo is not MISMATCH: - return (self._name, mo) - else: - return MISMATCH - - -class Forward(Pattern): - """Forward declaration of a pattern. - - .. code-block:: python - - >>> foo = Forward() - >>> foo <<= Sequence('NUMBER') - - """ - - def __init__(self): - self._pattern = None - - @property - def pattern(self): - return self._pattern - - def __ilshift__(self, other): - self._pattern = _wrap_string(other) - - return self - - def match(self, tokens): - return self._pattern.match(tokens) - - -class Grammar(object): +class Grammar: """Creates a tree of given tokens using the grammar `grammar`. """ - def __init__(self, grammar): + def __init__(self, grammar: Pattern | str) -> None: + self._root: Pattern if isinstance(grammar, str): - grammar = _wrap_string(grammar) - - self._root = grammar + self._root = _wrap_string(grammar) + else: + self._root = grammar - def parse(self, tokens, token_tree=False): + def parse(self, token_list: list[Token], token_tree: bool=False) -> MatchObject: if token_tree: - tokens = _Tokens(tokens) + tokens = _Tokens(token_list) else: - tokens = _StringTokens(tokens) + tokens = _StringTokens(token_list) parsed = self._root.match(tokens) @@ -688,7 +685,7 @@ def parse(self, tokens, token_tree=False): raise GrammarError(tokens.peek_max().offset) -def choice(*patterns): +def choice(*patterns: Pattern | str) -> Choice | ChoiceDict: """Returns an instance of the fastest choice class for given patterns `patterns`. It is recommended to use this function instead of instantiate :class:`~textparser.Choice` or @@ -702,7 +699,7 @@ def choice(*patterns): return Choice(*patterns) -def markup_line(text, offset, marker='>>!<<'): +def markup_line(text: str, offset: int, marker: str='>>!<<') -> str: """Insert `marker` at `offset` into `text`, and return the marked line. @@ -724,17 +721,17 @@ def markup_line(text, offset, marker='>>!<<'): return text[begin:offset] + marker + text[offset:end] -def line(text, offset): +def line(text: str, offset: int) -> int: return text[:offset].count('\n') + 1 -def column(text, offset): +def column(text: str, offset: int) -> int: line_start = text.rfind('\n', 0, offset) return offset - line_start -def tokenize_init(spec): +def tokenize_init(spec: collections.abc.Sequence[tuple[str, str] | tuple[str, str, int]]) -> tuple[list[Token], str]: """Initialize a tokenizer. Should only be called by the :func:`~textparser.Parser.tokenize` method in the parser. @@ -742,13 +739,13 @@ def tokenize_init(spec): tokens = [Token('__SOF__', '__SOF__', 0)] re_token = '|'.join([ - '(?P<{}>{})'.format(name, regex) for name, regex in spec + f'(?P<{token_spec[0]}>{token_spec[1]})' for token_spec in spec ]) return tokens, re_token -class Parser(object): +class Parser: """The abstract base class of all text parsers. .. code-block:: python @@ -768,7 +765,8 @@ class Parser(object): """ - def _unpack_token_specs(self): + def _unpack_token_specs(self) -> tuple[dict[str, str], + list[tuple[str,str]]]: names = {} specs = [] @@ -781,7 +779,7 @@ def _unpack_token_specs(self): return names, specs - def keywords(self): + def keywords(self) -> set[str]: """A set of keywords in the text. .. code-block:: python @@ -793,7 +791,7 @@ def keywords(self): return set() - def token_specs(self): + def token_specs(self) -> list[tuple[str, str] | tuple[str, str, str]]: """The token specifications with token name, regular expression, and optionally a user friendly name. @@ -813,7 +811,7 @@ def token_specs(self): ('MISMATCH', r'.') ] - def tokenize(self, text): + def tokenize(self, text: str) -> list[Token]: """Tokenize given string `text`, and return a list of tokens. Raises :class:`~textparser.TokenizeError` on failure. @@ -830,6 +828,7 @@ def tokenize(self, text): for mo in re.finditer(re_token, text, re.DOTALL): kind = mo.lastgroup + assert isinstance(kind, str) if kind == 'SKIP': pass @@ -848,7 +847,7 @@ def tokenize(self, text): return tokens - def grammar(self): + def grammar(self) -> Grammar: """The text grammar is used to create a parse tree out of a list of tokens. @@ -858,7 +857,7 @@ def grammar(self): raise NotImplementedError('No grammar defined.') - def parse(self, text, token_tree=False, match_sof=False): + def parse(self, text: str, token_tree: bool=False, match_sof:bool=False) -> MatchObject | MismatchSingleton: """Parse given string `text` and return the parse tree. Raises :class:`~textparser.ParseError` on failure. @@ -888,12 +887,19 @@ def parse(self, text, token_tree=False, match_sof=False): if len(tokens) > 0 and tokens[0].kind == '__SOF__': del tokens[0] - return Grammar(self.grammar()).parse(tokens, token_tree) + grammar = self.grammar() + if isinstance(grammar, Grammar): + return grammar.parse(tokens, token_tree) + else: + # used for compatibility with old user code from the + # pre-type hints era... + return Grammar(grammar).parse(tokens, token_tree) + except (TokenizeError, GrammarError) as e: - raise ParseError(text, e.offset) + raise ParseError(text, e.offset) from e -def replace_blocks(string, start='{', end='}'): +def replace_blocks(string: str, start: str='{', end: str='}') -> str: """Replace all blocks starting with `start` and ending with `end` with spaces (not including `start` and `end`). @@ -903,7 +909,7 @@ def replace_blocks(string, start='{', end='}'): begin = 0 depth = 0 start_length = len(start) - pattern = r'({}|{})'.format(re.escape(start), re.escape(end)) + pattern = rf'({re.escape(start)}|{re.escape(end)})' for mo in re.finditer(pattern, string): pos = mo.start()