From c8456cc0708b58fd95c88d4e5f9003c49af1d1ee Mon Sep 17 00:00:00 2001 From: Jack Oberman Date: Tue, 2 Jun 2026 18:52:57 -0400 Subject: [PATCH 01/12] feat: Add optional sqlalchemy_use_enum for dy.Enum Allow dy.Enum to emit sqlalchemy.Enum for to_sqlalchemy_columns when sqlalchemy_use_enum=True, with optional sqlalchemy_enum_name and column- name defaults for PostgreSQL native enum types. Closes #354 Co-authored-by: Cursor --- dataframely/columns/enum.py | 52 ++++++++++++++++++++++ docs/guides/features/sql-generation.md | 21 +++++++++ tests/column_types/test_enum.py | 40 +++++++++++++++++ tests/columns/test_sqlalchemy_columns.py | 56 ++++++++++++++++++++++++ 4 files changed, 169 insertions(+) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index 6c63f2c..e77fa6e 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -33,6 +33,8 @@ def __init__( alias: str | None = None, metadata: dict[str, Any] | None = None, description: str | None = None, + sqlalchemy_use_enum: bool = False, + sqlalchemy_enum_name: str | None = None, ): """ Args: @@ -68,6 +70,15 @@ def __init__( names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. description: A human-readable description of the column. + sqlalchemy_use_enum: When ``True``, map this column to :class:`sqlalchemy.Enum` + in :meth:`~dataframely.Schema.to_sqlalchemy_columns` instead of + ``CHAR`` / ``VARCHAR``. Use this for PostgreSQL native enum types and + Alembic schema drift detection. Defaults to ``False`` (string columns). + sqlalchemy_enum_name: Optional name for the SQLAlchemy / database enum type + when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a + Python :class:`enum.Enum` subclass, SQLAlchemy uses the enum class name + (lowercased). Otherwise the SQL column name from + :meth:`~dataframely.Schema.to_sqlalchemy_columns` is used. """ super().__init__( nullable=nullable, @@ -78,7 +89,11 @@ def __init__( metadata=metadata, description=description, ) + self.sqlalchemy_use_enum = sqlalchemy_use_enum + self.sqlalchemy_enum_name = sqlalchemy_enum_name + self._enum_class: type[enum.Enum] | None = None if isclass(categories) and issubclass(categories, enum.Enum): + self._enum_class = categories categories = (item.value for item in categories) self.categories = list(categories) @@ -91,12 +106,49 @@ def validate_dtype(self, dtype: PolarsDataType) -> bool: return False return self.categories == dtype.categories.to_list() + def sqlalchemy_column(self, name: str, dialect: sa.Dialect) -> sa.Column: + if self.sqlalchemy_use_enum: + return sa.Column( + name, + self._sqlalchemy_enum_type(dialect, column_name=name), + nullable=self.nullable, + primary_key=self.primary_key, + unique=self.unique, + autoincrement=False, + ) + return super().sqlalchemy_column(name, dialect) + def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: + if self.sqlalchemy_use_enum: + column_name = self._name or None + return self._sqlalchemy_enum_type(dialect, column_name=column_name) category_lengths = [len(c) for c in self.categories] if all(length == category_lengths[0] for length in category_lengths): return sa.CHAR(category_lengths[0]) return sa.String(max(category_lengths)) + def _sqlalchemy_enum_type( + self, _dialect: sa.Dialect, *, column_name: str | None + ) -> sa_TypeEngine: + length = max(len(c) for c in self.categories) + kwargs: dict[str, Any] = {"length": length} + name = self.sqlalchemy_enum_name + if self._enum_class is not None: + if name is not None: + kwargs["name"] = name + return sa.Enum(self._enum_class, **kwargs) + if name is None: + name = column_name + if name is None: + raise ValueError( + "sqlalchemy_enum_name is required for dy.Enum with string categories " + "and sqlalchemy_use_enum=True when not building columns via " + "Schema.to_sqlalchemy_columns(). Alternatively, pass a Python " + "enum.Enum class as categories." + ) + kwargs["name"] = name + return sa.Enum(*self.categories, **kwargs) + @property def pyarrow_dtype(self) -> pa.DataType: if len(self.categories) <= 2**8 - 1: diff --git a/docs/guides/features/sql-generation.md b/docs/guides/features/sql-generation.md index e84d2fd..fe7b11b 100644 --- a/docs/guides/features/sql-generation.md +++ b/docs/guides/features/sql-generation.md @@ -81,6 +81,27 @@ the maximal length of the string is inferred from the regular expression if poss maximal lengths can be particularly important for primary key columns. Some database systems, such as Microsoft SQL Server, do not allow `VARCHAR(max)` columns (unbounded strings) to be used as primary keys. ``` +## Native SQL enums (optional) + +By default, {class}`~dataframely.Enum` maps to fixed-length `CHAR` or `VARCHAR` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), set `sqlalchemy_use_enum=True`: + +```python +from enum import StrEnum + +import dataframely as dy + + +class Status(StrEnum): + PENDING = "pending" + APPROVED = "approved" + + +class Staged(dy.Schema): + status = dy.Enum(Status, sqlalchemy_use_enum=True) +``` + +When `categories` is a Python `enum.Enum` subclass, SQLAlchemy uses the enum class name (lowercased) as the database enum type name. For string category lists, the SQL column name is used by default; override it with `sqlalchemy_enum_name` if needed. On dialects without native enums (such as Microsoft SQL Server), SQLAlchemy falls back to `VARCHAR` with a check constraint. + ## Collections of multiple tables If you have an entire `dy.Collection`, it's also easy to generate one table for each member table of the collection. diff --git a/tests/column_types/test_enum.py b/tests/column_types/test_enum.py index 85078a3..9de0103 100644 --- a/tests/column_types/test_enum.py +++ b/tests/column_types/test_enum.py @@ -108,3 +108,43 @@ def test_sequences_and_enums( S = create_schema("test", {"x": dy.Enum(categories1)}) df = pl.DataFrame({"x": pl.Series(["a", "b"], dtype=pl.Enum(categories2))}) S.validate(df) + + +def test_matches_sqlalchemy_use_enum() -> None: + expr = pl.element() + assert dy.Enum(["a", "b"]).matches(dy.Enum(["a", "b"]), expr) + assert not dy.Enum(["a", "b"], sqlalchemy_use_enum=True).matches( + dy.Enum(["a", "b"]), expr + ) + assert dy.Enum(["a", "b"], sqlalchemy_use_enum=True).matches( + dy.Enum(["a", "b"], sqlalchemy_use_enum=True), expr + ) + + +def test_matches_sqlalchemy_enum_name() -> None: + expr = pl.element() + assert not dy.Enum( + ["a", "b"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="one", + ).matches( + dy.Enum( + ["a", "b"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="two", + ), + expr, + ) + + +def test_as_dict_from_dict_sqlalchemy_enum_flags() -> None: + column = dy.Enum( + ["a", "b"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="my_enum", + ) + data = column.as_dict(pl.element()) + restored = dy.Enum.from_dict(data) + assert restored.sqlalchemy_use_enum is True + assert restored.sqlalchemy_enum_name == "my_enum" + assert restored.categories == ["a", "b"] diff --git a/tests/columns/test_sqlalchemy_columns.py b/tests/columns/test_sqlalchemy_columns.py index 6731202..af25070 100644 --- a/tests/columns/test_sqlalchemy_columns.py +++ b/tests/columns/test_sqlalchemy_columns.py @@ -1,6 +1,8 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +from enum import Enum + import pytest import dataframely as dy @@ -171,3 +173,57 @@ def test_raise_for_object_column(dialect: Dialect) -> None: NotImplementedError, match="SQL column cannot have 'Object' type." ): dy.Object().sqlalchemy_dtype(dialect) + + +class _Status(str, Enum): + PENDING = "pending" + APPROVED = "approved" + + +@pytest.mark.parametrize( + ("column", "dialect", "datatype"), + [ + ( + dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True), + PGDialect_psycopg2(), + "a", + ), + ( + dy.Enum( + ["foo", "bar"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="my_status", + ), + PGDialect_psycopg2(), + "my_status", + ), + (dy.Enum(_Status, sqlalchemy_use_enum=True), PGDialect_psycopg2(), "_status"), + ( + dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True), + MSDialect_pyodbc(), + "VARCHAR(3)", + ), + ], +) +def test_enum_sqlalchemy_native(column: Column, dialect: Dialect, datatype: str) -> None: + schema = create_schema("test", {"a": column}) + columns = schema.to_sqlalchemy_columns(dialect) + assert len(columns) == 1 + assert columns[0].type.compile(dialect) == datatype + + +def test_enum_sqlalchemy_native_string_categories_use_column_name() -> None: + class TestSchema(dy.Schema): + status = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) + + column = TestSchema.columns()["status"] + assert column.sqlalchemy_dtype(PGDialect_psycopg2()).compile( + PGDialect_psycopg2() + ) == "status" + + +def test_enum_sqlalchemy_native_string_categories_requires_name_without_column( +) -> None: + column = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) + with pytest.raises(ValueError, match="sqlalchemy_enum_name is required"): + column.sqlalchemy_dtype(PGDialect_psycopg2()) From 07f5f43b3c2300207a0fd7b3d583b6d53709e7f1 Mon Sep 17 00:00:00 2001 From: Jack Oberman Date: Tue, 2 Jun 2026 19:01:31 -0400 Subject: [PATCH 02/12] fix: Align SQLAlchemy Enum values with dy.Enum categories --- dataframely/columns/enum.py | 16 +++++++--------- docs/guides/features/sql-generation.md | 4 ++-- tests/columns/test_sqlalchemy_columns.py | 7 +++++++ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index e77fa6e..f1d2699 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -78,7 +78,9 @@ def __init__( when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a Python :class:`enum.Enum` subclass, SQLAlchemy uses the enum class name (lowercased). Otherwise the SQL column name from - :meth:`~dataframely.Schema.to_sqlalchemy_columns` is used. + :meth:`~dataframely.Schema.to_sqlalchemy_columns` is used. For Python + enums, persisted values are the enum members' ``.value`` strings (not + member names), matching :attr:`categories`. """ super().__init__( nullable=nullable, @@ -108,14 +110,9 @@ def validate_dtype(self, dtype: PolarsDataType) -> bool: def sqlalchemy_column(self, name: str, dialect: sa.Dialect) -> sa.Column: if self.sqlalchemy_use_enum: - return sa.Column( - name, - self._sqlalchemy_enum_type(dialect, column_name=name), - nullable=self.nullable, - primary_key=self.primary_key, - unique=self.unique, - autoincrement=False, - ) + column = super().sqlalchemy_column(name, dialect) + column.type = self._sqlalchemy_enum_type(dialect, column_name=name) + return column return super().sqlalchemy_column(name, dialect) def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: @@ -136,6 +133,7 @@ def _sqlalchemy_enum_type( if self._enum_class is not None: if name is not None: kwargs["name"] = name + kwargs["values_callable"] = lambda enum: [member.value for member in enum] return sa.Enum(self._enum_class, **kwargs) if name is None: name = column_name diff --git a/docs/guides/features/sql-generation.md b/docs/guides/features/sql-generation.md index fe7b11b..696e01e 100644 --- a/docs/guides/features/sql-generation.md +++ b/docs/guides/features/sql-generation.md @@ -86,12 +86,12 @@ maximal lengths can be particularly important for primary key columns. Some data By default, {class}`~dataframely.Enum` maps to fixed-length `CHAR` or `VARCHAR` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), set `sqlalchemy_use_enum=True`: ```python -from enum import StrEnum +from enum import Enum import dataframely as dy -class Status(StrEnum): +class Status(str, Enum): PENDING = "pending" APPROVED = "approved" diff --git a/tests/columns/test_sqlalchemy_columns.py b/tests/columns/test_sqlalchemy_columns.py index af25070..9cd8fcf 100644 --- a/tests/columns/test_sqlalchemy_columns.py +++ b/tests/columns/test_sqlalchemy_columns.py @@ -212,6 +212,13 @@ def test_enum_sqlalchemy_native(column: Column, dialect: Dialect, datatype: str) assert columns[0].type.compile(dialect) == datatype +def test_enum_sqlalchemy_native_python_enum_uses_member_values() -> None: + column = dy.Enum(_Status, sqlalchemy_use_enum=True) + schema = create_schema("test", {"a": column}) + sa_type = schema.to_sqlalchemy_columns(PGDialect_psycopg2())[0].type + assert list(sa_type.enums) == column.categories + + def test_enum_sqlalchemy_native_string_categories_use_column_name() -> None: class TestSchema(dy.Schema): status = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) From ba13effdbb059870047bb022c702fa55073f1ece Mon Sep 17 00:00:00 2001 From: Jack Oberman Date: Wed, 1 Jul 2026 08:41:29 -0400 Subject: [PATCH 03/12] refactor: Address PR review comments for sqlalchemy_use_enum Co-Authored-By: Claude Sonnet 4.6 --- dataframely/columns/enum.py | 60 +++++++++++++++----------- docs/guides/features/sql-generation.md | 20 ++++++--- tests/column_types/test_enum.py | 17 ++++++++ 3 files changed, 66 insertions(+), 31 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index f1d2699..74bf2b0 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -91,6 +91,10 @@ def __init__( metadata=metadata, description=description, ) + if sqlalchemy_enum_name and not sqlalchemy_use_enum: + raise ValueError( + "sqlalchemy_enum_name has no effect when sqlalchemy_use_enum=False." + ) self.sqlalchemy_use_enum = sqlalchemy_use_enum self.sqlalchemy_enum_name = sqlalchemy_enum_name self._enum_class: type[enum.Enum] | None = None @@ -108,13 +112,6 @@ def validate_dtype(self, dtype: PolarsDataType) -> bool: return False return self.categories == dtype.categories.to_list() - def sqlalchemy_column(self, name: str, dialect: sa.Dialect) -> sa.Column: - if self.sqlalchemy_use_enum: - column = super().sqlalchemy_column(name, dialect) - column.type = self._sqlalchemy_enum_type(dialect, column_name=name) - return column - return super().sqlalchemy_column(name, dialect) - def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: if self.sqlalchemy_use_enum: column_name = self._name or None @@ -127,25 +124,36 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: def _sqlalchemy_enum_type( self, _dialect: sa.Dialect, *, column_name: str | None ) -> sa_TypeEngine: - length = max(len(c) for c in self.categories) - kwargs: dict[str, Any] = {"length": length} - name = self.sqlalchemy_enum_name - if self._enum_class is not None: - if name is not None: - kwargs["name"] = name - kwargs["values_callable"] = lambda enum: [member.value for member in enum] - return sa.Enum(self._enum_class, **kwargs) - if name is None: - name = column_name - if name is None: - raise ValueError( - "sqlalchemy_enum_name is required for dy.Enum with string categories " - "and sqlalchemy_use_enum=True when not building columns via " - "Schema.to_sqlalchemy_columns(). Alternatively, pass a Python " - "enum.Enum class as categories." - ) - kwargs["name"] = name - return sa.Enum(*self.categories, **kwargs) + match self._enum_class: + case None: + # Enum built from inputting string-categories: requires an + # explicit name (from sqlalchemy_enum_name or the SQL column + # name set by Schema.to_sqlalchemy_columns). + name = self.sqlalchemy_enum_name or column_name + if name is None: + raise ValueError( + "sqlalchemy_enum_name is required for dy.Enum with string " + "categories and sqlalchemy_use_enum=True when not building " + "columns via Schema.to_sqlalchemy_columns(). Alternatively, " + "pass a Python enum.Enum class as categories." + ) + return sa.Enum(*self.categories, name=name) + case enum_class: + # dy.Enum was constructed from a Python enum.Enum class. + # Persist .value strings (not member names) to stay consistent + # with how dy.Enum stores self.categories. + # Omit name entirely when unset — passing name=None suppresses + # SQLAlchemy's default of using the class name (lowercased). + name_kwargs: dict[str, str] = ( + {"name": self.sqlalchemy_enum_name} + if self.sqlalchemy_enum_name is not None + else {} + ) + return sa.Enum( + enum_class, + values_callable=lambda e: [m.value for m in e], + **name_kwargs, + ) @property def pyarrow_dtype(self) -> pa.DataType: diff --git a/docs/guides/features/sql-generation.md b/docs/guides/features/sql-generation.md index 696e01e..4b90fdd 100644 --- a/docs/guides/features/sql-generation.md +++ b/docs/guides/features/sql-generation.md @@ -83,24 +83,34 @@ maximal lengths can be particularly important for primary key columns. Some data ## Native SQL enums (optional) -By default, {class}`~dataframely.Enum` maps to fixed-length `CHAR` or `VARCHAR` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), set `sqlalchemy_use_enum=True`: +By default, {class}`~dataframely.Enum` maps to `sa.CHAR` or `sa.String` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), set `sqlalchemy_use_enum=True`: ```python -from enum import Enum +from enum import Enum, auto +import sqlalchemy as sa import dataframely as dy +from sqlalchemy.dialects.postgresql import dialect as pg_dialect +from sqlalchemy.dialects.mssql import dialect as mssql_dialect class Status(str, Enum): - PENDING = "pending" - APPROVED = "approved" + PENDING = auto() + APPROVED = auto() class Staged(dy.Schema): status = dy.Enum(Status, sqlalchemy_use_enum=True) + + +pg_cols = Staged.to_sqlalchemy_columns(pg_dialect()()) +# pg_cols[0].type is sa.Enum — compiles to: "status" (PostgreSQL native ENUM) + +mssql_cols = Staged.to_sqlalchemy_columns(mssql_dialect()()) +# mssql_cols[0].type falls back to VARCHAR with a CHECK constraint ``` -When `categories` is a Python `enum.Enum` subclass, SQLAlchemy uses the enum class name (lowercased) as the database enum type name. For string category lists, the SQL column name is used by default; override it with `sqlalchemy_enum_name` if needed. On dialects without native enums (such as Microsoft SQL Server), SQLAlchemy falls back to `VARCHAR` with a check constraint. +When `categories` is a Python `enum.Enum` subclass, SQLAlchemy uses the enum class name (lowercased) as the database enum type name. For string category lists, the SQL column name is used by default; override it with `sqlalchemy_enum_name` if needed. On dialects without native enums (such as Microsoft SQL Server), SQLAlchemy falls back to `sa.String` with a check constraint. ## Collections of multiple tables diff --git a/tests/column_types/test_enum.py b/tests/column_types/test_enum.py index 9de0103..fb74409 100644 --- a/tests/column_types/test_enum.py +++ b/tests/column_types/test_enum.py @@ -123,6 +123,18 @@ def test_matches_sqlalchemy_use_enum() -> None: def test_matches_sqlalchemy_enum_name() -> None: expr = pl.element() + assert dy.Enum( + ["a", "b"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="one", + ).matches( + dy.Enum( + ["a", "b"], + sqlalchemy_use_enum=True, + sqlalchemy_enum_name="one", + ), + expr, + ) assert not dy.Enum( ["a", "b"], sqlalchemy_use_enum=True, @@ -137,6 +149,11 @@ def test_matches_sqlalchemy_enum_name() -> None: ) +def test_sqlalchemy_enum_name_without_use_enum_raises() -> None: + with pytest.raises(ValueError, match="sqlalchemy_enum_name has no effect"): + dy.Enum(["a", "b"], sqlalchemy_enum_name="my_enum") + + def test_as_dict_from_dict_sqlalchemy_enum_flags() -> None: column = dy.Enum( ["a", "b"], From 170502af48a4220c0bd661af25e0568b9ecebc1d Mon Sep 17 00:00:00 2001 From: Andreas Albert <103571926+AndreasAlbertQC@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:16:32 +0200 Subject: [PATCH 04/12] Apply suggestions from code review Co-authored-by: Andreas Albert <103571926+AndreasAlbertQC@users.noreply.github.com> --- docs/guides/features/sql-generation.md | 2 +- tests/columns/test_sqlalchemy_columns.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/guides/features/sql-generation.md b/docs/guides/features/sql-generation.md index 4b90fdd..6099b67 100644 --- a/docs/guides/features/sql-generation.md +++ b/docs/guides/features/sql-generation.md @@ -83,7 +83,7 @@ maximal lengths can be particularly important for primary key columns. Some data ## Native SQL enums (optional) -By default, {class}`~dataframely.Enum` maps to `sa.CHAR` or `sa.String` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), set `sqlalchemy_use_enum=True`: +By default, {class}`~dataframely.Enum` maps to `sa.CHAR` or `sa.String` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), you may set `sqlalchemy_use_enum=True` to instead generate native enums: ```python from enum import Enum, auto diff --git a/tests/columns/test_sqlalchemy_columns.py b/tests/columns/test_sqlalchemy_columns.py index 9cd8fcf..532cee5 100644 --- a/tests/columns/test_sqlalchemy_columns.py +++ b/tests/columns/test_sqlalchemy_columns.py @@ -1,7 +1,7 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause -from enum import Enum +from enum import StrEnum import pytest @@ -175,7 +175,7 @@ def test_raise_for_object_column(dialect: Dialect) -> None: dy.Object().sqlalchemy_dtype(dialect) -class _Status(str, Enum): +class _Status(StrEnum): PENDING = "pending" APPROVED = "approved" From 1e9cc981c874d7e1478f6f4710e95f9d2c829766 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 15:23:41 +0200 Subject: [PATCH 05/12] fix --- tests/columns/test_sqlalchemy_columns.py | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/columns/test_sqlalchemy_columns.py b/tests/columns/test_sqlalchemy_columns.py index 532cee5..c97d114 100644 --- a/tests/columns/test_sqlalchemy_columns.py +++ b/tests/columns/test_sqlalchemy_columns.py @@ -1,12 +1,13 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause -from enum import StrEnum +from enum import Enum +from typing import cast import pytest import dataframely as dy -from dataframely._compat import Dialect, MSDialect_pyodbc, PGDialect_psycopg2 +from dataframely._compat import Dialect, MSDialect_pyodbc, PGDialect_psycopg2, sa from dataframely.columns import Column from dataframely.testing import COLUMN_TYPES, create_schema @@ -175,7 +176,7 @@ def test_raise_for_object_column(dialect: Dialect) -> None: dy.Object().sqlalchemy_dtype(dialect) -class _Status(StrEnum): +class _Status(str, Enum): PENDING = "pending" APPROVED = "approved" @@ -205,7 +206,9 @@ class _Status(StrEnum): ), ], ) -def test_enum_sqlalchemy_native(column: Column, dialect: Dialect, datatype: str) -> None: +def test_enum_sqlalchemy_native( + column: Column, dialect: Dialect, datatype: str +) -> None: schema = create_schema("test", {"a": column}) columns = schema.to_sqlalchemy_columns(dialect) assert len(columns) == 1 @@ -215,7 +218,9 @@ def test_enum_sqlalchemy_native(column: Column, dialect: Dialect, datatype: str) def test_enum_sqlalchemy_native_python_enum_uses_member_values() -> None: column = dy.Enum(_Status, sqlalchemy_use_enum=True) schema = create_schema("test", {"a": column}) - sa_type = schema.to_sqlalchemy_columns(PGDialect_psycopg2())[0].type + sa_type = cast( + sa.sql.sqltypes.Enum, schema.to_sqlalchemy_columns(PGDialect_psycopg2())[0].type + ) assert list(sa_type.enums) == column.categories @@ -224,13 +229,15 @@ class TestSchema(dy.Schema): status = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) column = TestSchema.columns()["status"] - assert column.sqlalchemy_dtype(PGDialect_psycopg2()).compile( - PGDialect_psycopg2() - ) == "status" + assert ( + column.sqlalchemy_dtype(PGDialect_psycopg2()).compile(PGDialect_psycopg2()) + == "status" + ) -def test_enum_sqlalchemy_native_string_categories_requires_name_without_column( -) -> None: +def test_enum_sqlalchemy_native_string_categories_requires_name_without_column() -> ( + None +): column = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) with pytest.raises(ValueError, match="sqlalchemy_enum_name is required"): column.sqlalchemy_dtype(PGDialect_psycopg2()) From 3b4cbd67a2aceba213b4b9119369cac0a4b90d8a Mon Sep 17 00:00:00 2001 From: Andreas Albert <103571926+AndreasAlbertQC@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:46:56 +0200 Subject: [PATCH 06/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- dataframely/columns/enum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index 74bf2b0..f490378 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -73,7 +73,7 @@ def __init__( sqlalchemy_use_enum: When ``True``, map this column to :class:`sqlalchemy.Enum` in :meth:`~dataframely.Schema.to_sqlalchemy_columns` instead of ``CHAR`` / ``VARCHAR``. Use this for PostgreSQL native enum types and - Alembic schema drift detection. Defaults to ``False`` (string columns). + Alembic schema drift detection. sqlalchemy_enum_name: Optional name for the SQLAlchemy / database enum type when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a Python :class:`enum.Enum` subclass, SQLAlchemy uses the enum class name From 35c6763fa4e3b52631147972262220508600839c Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 16:14:02 +0200 Subject: [PATCH 07/12] docs --- docs/guides/features/sql-generation.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/guides/features/sql-generation.md b/docs/guides/features/sql-generation.md index 6099b67..d186eb6 100644 --- a/docs/guides/features/sql-generation.md +++ b/docs/guides/features/sql-generation.md @@ -81,9 +81,9 @@ the maximal length of the string is inferred from the regular expression if poss maximal lengths can be particularly important for primary key columns. Some database systems, such as Microsoft SQL Server, do not allow `VARCHAR(max)` columns (unbounded strings) to be used as primary keys. ``` -## Native SQL enums (optional) +## Native SQL enums -By default, {class}`~dataframely.Enum` maps to `sa.CHAR` or `sa.String` columns so stored values remain plain strings. For PostgreSQL setups that use database-level `ENUM` types (for example with Alembic autogenerate), you may set `sqlalchemy_use_enum=True` to instead generate native enums: +By default, {class}`~dataframely.Enum` maps to `sa.CHAR` or `sa.String` columns so stored values remain plain strings. You may set `sqlalchemy_use_enum=True` to instead generate native enums: ```python from enum import Enum, auto @@ -101,16 +101,20 @@ class Status(str, Enum): class Staged(dy.Schema): status = dy.Enum(Status, sqlalchemy_use_enum=True) +``` +This will translate the `~dataframely.Enum` to a `~sqlalchemy.Enum`: -pg_cols = Staged.to_sqlalchemy_columns(pg_dialect()()) -# pg_cols[0].type is sa.Enum — compiles to: "status" (PostgreSQL native ENUM) - -mssql_cols = Staged.to_sqlalchemy_columns(mssql_dialect()()) -# mssql_cols[0].type falls back to VARCHAR with a CHECK constraint +```python +>>> Staged.to_sqlalchemy_columns(pg_dialect()) +[Column('status', Enum('1', '2', name='status'), table=None, nullable=False)] ``` -When `categories` is a Python `enum.Enum` subclass, SQLAlchemy uses the enum class name (lowercased) as the database enum type name. For string category lists, the SQL column name is used by default; override it with `sqlalchemy_enum_name` if needed. On dialects without native enums (such as Microsoft SQL Server), SQLAlchemy falls back to `sa.String` with a check constraint. +Depending on the database dialect you use, `sqlalchemy` will render this accordingly. +For example, `postgresql` supports native enums, and `sqlalchemy` will create a native enum column, while in MSSQL, where this is not supported, it will fall back to `VARCHAR`. + +When `categories` is a Python `enum.Enum` subclass, `sqlalchemy` uses the enum class name (lowercased) as the database enum type name. +For string category lists, the SQL column name is used by default; override it with `sqlalchemy_enum_name` if needed. ## Collections of multiple tables From c02ac19f4c1856c09e78cbbd2effda00e4520d8c Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 16:47:07 +0200 Subject: [PATCH 08/12] refactor --- dataframely/columns/enum.py | 52 +++++++----------------- tests/column_types/test_enum.py | 9 ++++ tests/columns/test_sqlalchemy_columns.py | 8 ---- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index f490378..ed201a3 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -97,10 +97,19 @@ def __init__( ) self.sqlalchemy_use_enum = sqlalchemy_use_enum self.sqlalchemy_enum_name = sqlalchemy_enum_name - self._enum_class: type[enum.Enum] | None = None if isclass(categories) and issubclass(categories, enum.Enum): - self._enum_class = categories + # If the user passed an Enum type, we want to determine a default name + # based on the Enum class name, which is also what sqlalchemy does. + # One could instead keep a reference to the Enum class around and pass it + # to sqlalchemy later on, but that will interfere with the base-class implementations + # of `matches` and `to_dict` / `from_dict`. + if self.sqlalchemy_use_enum: + self.sqlalchemy_enum_name = ( + self.sqlalchemy_enum_name or categories.__name__.lower() + ) + categories = (item.value for item in categories) + self.categories = list(categories) @property @@ -114,47 +123,14 @@ def validate_dtype(self, dtype: PolarsDataType) -> bool: def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: if self.sqlalchemy_use_enum: - column_name = self._name or None - return self._sqlalchemy_enum_type(dialect, column_name=column_name) + return sa.Enum( + *self.categories, name=self.sqlalchemy_enum_name or self._name + ) category_lengths = [len(c) for c in self.categories] if all(length == category_lengths[0] for length in category_lengths): return sa.CHAR(category_lengths[0]) return sa.String(max(category_lengths)) - def _sqlalchemy_enum_type( - self, _dialect: sa.Dialect, *, column_name: str | None - ) -> sa_TypeEngine: - match self._enum_class: - case None: - # Enum built from inputting string-categories: requires an - # explicit name (from sqlalchemy_enum_name or the SQL column - # name set by Schema.to_sqlalchemy_columns). - name = self.sqlalchemy_enum_name or column_name - if name is None: - raise ValueError( - "sqlalchemy_enum_name is required for dy.Enum with string " - "categories and sqlalchemy_use_enum=True when not building " - "columns via Schema.to_sqlalchemy_columns(). Alternatively, " - "pass a Python enum.Enum class as categories." - ) - return sa.Enum(*self.categories, name=name) - case enum_class: - # dy.Enum was constructed from a Python enum.Enum class. - # Persist .value strings (not member names) to stay consistent - # with how dy.Enum stores self.categories. - # Omit name entirely when unset — passing name=None suppresses - # SQLAlchemy's default of using the class name (lowercased). - name_kwargs: dict[str, str] = ( - {"name": self.sqlalchemy_enum_name} - if self.sqlalchemy_enum_name is not None - else {} - ) - return sa.Enum( - enum_class, - values_callable=lambda e: [m.value for m in e], - **name_kwargs, - ) - @property def pyarrow_dtype(self) -> pa.DataType: if len(self.categories) <= 2**8 - 1: diff --git a/tests/column_types/test_enum.py b/tests/column_types/test_enum.py index fb74409..5ba133b 100644 --- a/tests/column_types/test_enum.py +++ b/tests/column_types/test_enum.py @@ -121,6 +121,15 @@ def test_matches_sqlalchemy_use_enum() -> None: ) +def test_matches_sqlalchemy_use_enum_fails_on_internal_name_mismatch() -> None: + class MyEnum(str, Enum): + x = "x" + + assert not dy.Enum(MyEnum, sqlalchemy_use_enum=True).matches( + dy.Enum(["x"], sqlalchemy_use_enum=True), pl.element() + ) + + def test_matches_sqlalchemy_enum_name() -> None: expr = pl.element() assert dy.Enum( diff --git a/tests/columns/test_sqlalchemy_columns.py b/tests/columns/test_sqlalchemy_columns.py index c97d114..3a53ce8 100644 --- a/tests/columns/test_sqlalchemy_columns.py +++ b/tests/columns/test_sqlalchemy_columns.py @@ -233,11 +233,3 @@ class TestSchema(dy.Schema): column.sqlalchemy_dtype(PGDialect_psycopg2()).compile(PGDialect_psycopg2()) == "status" ) - - -def test_enum_sqlalchemy_native_string_categories_requires_name_without_column() -> ( - None -): - column = dy.Enum(["foo", "bar"], sqlalchemy_use_enum=True) - with pytest.raises(ValueError, match="sqlalchemy_enum_name is required"): - column.sqlalchemy_dtype(PGDialect_psycopg2()) From 67987a197e16b0f0e9fceffd4d8f94e762e6f1b1 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 16:48:19 +0200 Subject: [PATCH 09/12] doc --- dataframely/columns/enum.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index ed201a3..d3156c4 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -72,8 +72,7 @@ def __init__( description: A human-readable description of the column. sqlalchemy_use_enum: When ``True``, map this column to :class:`sqlalchemy.Enum` in :meth:`~dataframely.Schema.to_sqlalchemy_columns` instead of - ``CHAR`` / ``VARCHAR``. Use this for PostgreSQL native enum types and - Alembic schema drift detection. + ``CHAR`` / ``VARCHAR``. sqlalchemy_enum_name: Optional name for the SQLAlchemy / database enum type when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a Python :class:`enum.Enum` subclass, SQLAlchemy uses the enum class name @@ -93,8 +92,9 @@ def __init__( ) if sqlalchemy_enum_name and not sqlalchemy_use_enum: raise ValueError( - "sqlalchemy_enum_name has no effect when sqlalchemy_use_enum=False." + "`sqlalchemy_enum_name` has no effect when `sqlalchemy_use_enum=False`." ) + self.sqlalchemy_use_enum = sqlalchemy_use_enum self.sqlalchemy_enum_name = sqlalchemy_enum_name if isclass(categories) and issubclass(categories, enum.Enum): @@ -110,6 +110,8 @@ def __init__( categories = (item.value for item in categories) + if self.sqlalchemy_use_enum and not self.sqlalchemy_enum_name: + raise ValueError("`sqlalchemy_enum_name` is required when ") self.categories = list(categories) @property From 7450eac7b0a2ee4cffba4ab54925f5e6d2a333f1 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 16:52:28 +0200 Subject: [PATCH 10/12] doc --- dataframely/columns/enum.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index d3156c4..53dd7cc 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -75,10 +75,9 @@ def __init__( ``CHAR`` / ``VARCHAR``. sqlalchemy_enum_name: Optional name for the SQLAlchemy / database enum type when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a - Python :class:`enum.Enum` subclass, SQLAlchemy uses the enum class name - (lowercased). Otherwise the SQL column name from - :meth:`~dataframely.Schema.to_sqlalchemy_columns` is used. For Python - enums, persisted values are the enum members' ``.value`` strings (not + Python :class:`enum.Enum` subclass, the lowercased enum class is used. + Otherwise, the name of the column is used. + The ersisted values are the enum members' ``.value`` strings (not member names), matching :attr:`categories`. """ super().__init__( From 753d12d54d0c83a246d591c3ab2b9e8888da4aff Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 1 Jul 2026 17:00:30 +0200 Subject: [PATCH 11/12] fix --- dataframely/columns/enum.py | 2 -- tests/column_types/test_enum.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index 53dd7cc..d93ef44 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -109,8 +109,6 @@ def __init__( categories = (item.value for item in categories) - if self.sqlalchemy_use_enum and not self.sqlalchemy_enum_name: - raise ValueError("`sqlalchemy_enum_name` is required when ") self.categories = list(categories) @property diff --git a/tests/column_types/test_enum.py b/tests/column_types/test_enum.py index 5ba133b..0fd9e6e 100644 --- a/tests/column_types/test_enum.py +++ b/tests/column_types/test_enum.py @@ -159,7 +159,7 @@ def test_matches_sqlalchemy_enum_name() -> None: def test_sqlalchemy_enum_name_without_use_enum_raises() -> None: - with pytest.raises(ValueError, match="sqlalchemy_enum_name has no effect"): + with pytest.raises(ValueError, match="`sqlalchemy_enum_name` has no effect"): dy.Enum(["a", "b"], sqlalchemy_enum_name="my_enum") From ad714051fc70f2ba01ab490a28354d30ce00f45d Mon Sep 17 00:00:00 2001 From: Jack Oberman Date: Wed, 1 Jul 2026 11:11:23 -0400 Subject: [PATCH 12/12] spell fix --- dataframely/columns/enum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index d93ef44..e6f7336 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -77,7 +77,7 @@ def __init__( when ``sqlalchemy_use_enum=True``. If omitted and ``categories`` is a Python :class:`enum.Enum` subclass, the lowercased enum class is used. Otherwise, the name of the column is used. - The ersisted values are the enum members' ``.value`` strings (not + The persisted values are the enum members' ``.value`` strings (not member names), matching :attr:`categories`. """ super().__init__(