table_bases

sc_crawler.table_bases #

Tiny helper classes for the most commonly used fields to be inherited by sc_crawler.tables.

ScMetaModel #

Bases: __class__

Custom class factory to auto-update table models.

Reuse description of the table and its fields as SQL comment.

Checking if the table and its fields have explicit comment set to be shown in the CREATE TABLE statements, and if not, reuse the optional table and field descriptions. Table docstrings are truncated to first line.
Reuse description of the fields to dynamically append to the docstring in the Attributes section.
Set __validator__ to the parent Pydantic model without table=True, which is useful for running validations. The Pydantic model is found by the parent class' name ending in "Base".
Auto-generate SCD table docs from the non-SCD table docs.

Source code in sc_crawler/table_bases.py

class ScMetaModel(SQLModel.__class__):
    """Custom class factory to auto-update table models.

    - Reuse description of the table and its fields as SQL comment.

        Checking if the table and its fields have explicit comment set
        to be shown in the `CREATE TABLE` statements, and if not,
        reuse the optional table and field descriptions. Table
        docstrings are truncated to first line.

    - Reuse description of the fields to dynamically append to the
        docstring in the Attributes section.

    - Set `__validator__` to the parent Pydantic model without
        `table=True`, which is useful for running validations.
        The Pydantic model is found by the parent class' name ending in "Base".

    - Auto-generate SCD table docs from the non-SCD table docs.
    """

    def __init__(subclass, *args, **kwargs):
        """Post-process a newly created model class.

        Classes whose `model_config` has no `table` entry are left
        untouched. For table classes the metadata naming convention is
        set, missing table/column comments are filled in from the
        docstring and field descriptions, the class docstring is
        extended (or generated for `*Scd` classes), and `__validator__`
        is set to the first direct base whose name ends in "Base".

        Raises:
            IndexError: If an `*Scd` class has no same-named non-SCD
                table in `tables`, or if no direct base class name
                ends in "Base".
        """
        super().__init__(*args, **kwargs)
        # early return for non-tables
        if subclass.model_config.get("table") is None:
            return
        satable = subclass.metadata.tables[subclass.__tablename__]

        # enforce auto-naming constraints as per
        # https://alembic.sqlalchemy.org/en/latest/naming.html
        subclass.metadata.naming_convention = {
            "ix": "ix_%(column_0_label)s",
            "uq": "uq_%(table_name)s_%(column_0_name)s",
            "ck": "ck_%(table_name)s_%(constraint_name)s",
            "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
            "pk": "pk_%(table_name)s",
        }

        # table comment: only the first docstring line, and never
        # overriding an explicitly set comment
        if subclass.__doc__ and satable.comment is None:
            satable.comment = subclass.__doc__.splitlines()[0]

        # column comments: fall back to the field description when no
        # explicit comment was set
        for k, v in subclass.model_fields.items():
            column = satable.columns[k]
            if v.description and column.comment is None:
                column.comment = v.description

        # generate docstring for SCD tables
        if subclass.__name__.endswith("Scd"):
            from .tables import tables

            nonscd = [t for t in tables if t.__name__ == subclass.__name__[:-3]][0]
            # the non-SCD table may have no docstring at all
            doclines = (nonscd.__doc__ or "").splitlines() or [""]
            # drop the trailing dot (only if there is one) and append SCD
            summary = doclines[0].rstrip().removesuffix(".")
            doclines[0] = f"{summary} (SCD Type 2)." if summary else "(SCD Type 2)."
            subclass.__doc__ = "\n".join(doclines)
        else:
            # describe table columns as attributes in docstring
            attribute_lines = []
            for k, v in subclass.model_fields.items():
                if not hasattr(v.annotation, "__args__"):
                    # plain classes are shown by name; fall back to the
                    # string form for annotations lacking `__name__`
                    typehint = getattr(v.annotation, "__name__", str(v.annotation))
                else:
                    typehint = str(v.annotation)
                description = satable.columns[k].comment
                attribute_lines.append(f"    {k} ({typehint}): {description}\n")
            # a table class without a docstring has `__doc__` set to None,
            # so it cannot be concatenated with directly
            base_doc = subclass.__doc__
            header = f"{base_doc}\n\nAttributes:\n" if base_doc else "Attributes:\n"
            subclass.__doc__ = header + "".join(attribute_lines)

        # find Pydantic model parent to be used for validating
        subclass.__validator__ = [
            m for m in subclass.__bases__ if m.__name__.endswith("Base")
        ][0]

ScModel #

Bases: SQLModel

Custom extensions to SQLModel objects and tables.

Extra features:

auto-generated table names using snake_case,
support for hashing table rows,
reuse description field of tables/columns as SQL comment,
reuse description field of columns to extend the Attributes section of the docstring.

Source code in sc_crawler/table_bases.py

class ScModel(SQLModel, metaclass=ScMetaModel):
    """Custom extensions to SQLModel objects and tables.

    Extra features:

    - auto-generated table names using [snake_case][sc_crawler.str_utils.snake_case],
    - support for hashing table rows,
    - reuse description field of tables/columns as SQL comment,
    - reuse description field of columns to extend the `Attributes` section of the docstring.
    """

    @declared_attr  # type: ignore
    def __tablename__(cls) -> str:
        """Override tables names using all-lowercase [snake_case][sc_crawler.str_utils.snake_case]."""
        return snake_case(cls.__name__)

    @classmethod
    def get_columns(cls) -> dict[str, List[str]]:
        """Return the table's column names in a dict for all, primary keys, and attributes.

        Returns:
            Dictionary with three lists of column names: `all`,
            `primary_keys`, and `attributes` (columns that are not
            primary keys).
        """
        columns = cls.__table__.columns.keys()
        pks = [pk.name for pk in inspect(cls).primary_key]
        # build the lookup set once instead of once per column
        pk_names = set(pks)
        attributes = [a for a in columns if a not in pk_names]
        return {"all": columns, "primary_keys": pks, "attributes": attributes}

    @classmethod
    def get_table_name(cls) -> str:
        """Return the SQLModel object's table name."""
        return str(cls.__tablename__)

    @classmethod
    def get_validator(cls) -> Union["ScModel", None]:
        """Return the parent Base Pydantic model (without a table definition).

        Returns:
            The class stored in `__validator__`, or `None` when the
            model config has no `table` entry.
        """
        if cls.model_config.get("table") is None:
            return None
        return cls.__validator__

    @classmethod
    def get_scd(cls) -> Union["ScModel", None]:
        """Return the SCD version of the SQLModel table.

        Returns:
            The single entry of `tables_scd` sharing this class'
            validator, or `None` when the model config has no `table`
            entry.

        Raises:
            ValueError: If the number of matching SCD tables is not
                exactly one.
        """
        if cls.model_config.get("table") is None:
            return None
        from .tables_scd import tables_scd

        validator = cls.get_validator()
        scds = [t for t in tables_scd if t.get_validator() == validator]
        if len(scds) != 1:
            raise ValueError("Not found SCD definition.")
        return scds[0]

    @classmethod
    def hash(
        cls,
        session: Session,
        ignored: List[str] = ["observed_at"],
        progress: Optional[Progress] = None,
    ) -> dict:
        """Hash the content of the rows.

        Args:
            session: Database connection to use for object lookups.
            ignored: List of column names to exclude from hashing.
            progress: Optional progress bar to track the status of the hashing.

        Returns:
            Dictionary of the row hashes keyed by the JSON dump of primary keys.
        """
        pks = sorted(cls.get_columns()["primary_keys"])
        rows = session.exec(statement=select(cls))
        row_count = session.query(cls).count()
        if progress:
            table_task_id = progress.add_task(
                cls.get_table_name(),
                total=row_count,
            )
        # no use of a generator as will need to serialize to JSON anyway
        hashes = {}
        for i, row in enumerate(rows):
            # NOTE Pydantic is warning when read Gpu/Storage as dict
            # https://github.com/tiangolo/sqlmodel/issues/63#issuecomment-1081555082
            rowdict = row.model_dump(warnings=False)
            keys = {pk: rowdict.get(pk) for pk in pks}
            keys_id = dumps(keys, sort_keys=True)
            # primary keys identify the row and ignored columns must not
            # influence the hash, so both are removed before hashing
            for dropkey in [*ignored, *pks]:
                rowdict.pop(dropkey, None)
            rowhash = sha1(dumps(rowdict, sort_keys=True).encode()).hexdigest()
            hashes[keys_id] = rowhash
            if progress:
                # updating the progress bar is expensive, so limit with many iterations
                if row_count > 1e3:
                    if (i + 1) % 1000 == 0:
                        progress.update(table_task_id, advance=1000)
                    # report the remainder once the last counted row is reached
                    if i == row_count - 1:
                        progress.update(table_task_id, advance=row_count % 1000)
                else:
                    progress.update(table_task_id, advance=1)

        return hashes

__tablename__ #

__tablename__()

Override tables names using all-lowercase snake_case.

Source code in sc_crawler/table_bases.py

@declared_attr  # type: ignore
def __tablename__(cls) -> str:
    """Derive the table name from the class name via [snake_case][sc_crawler.str_utils.snake_case]."""
    class_name = cls.__name__
    return snake_case(class_name)

get_columns `classmethod` #

get_columns()

Return the table's column names in a dict for all, primary keys, and attributes.

Source code in sc_crawler/table_bases.py

@classmethod
def get_columns(cls) -> dict[str, List[str]]:
    """Return the table's column names in a dict for all, primary keys, and attributes.

    Returns:
        Dictionary with three lists of column names: `all`,
        `primary_keys`, and `attributes` (columns that are not
        primary keys).
    """
    columns = cls.__table__.columns.keys()
    pks = [pk.name for pk in inspect(cls).primary_key]
    # build the lookup set once instead of once per column
    pk_names = set(pks)
    attributes = [a for a in columns if a not in pk_names]
    return {"all": columns, "primary_keys": pks, "attributes": attributes}

get_table_name `classmethod` #

get_table_name()

Return the SQLModel object's table name.

Source code in sc_crawler/table_bases.py

@classmethod
def get_table_name(cls) -> str:
    """Give the value of `__tablename__` for this model as a plain string."""
    table_name = cls.__tablename__
    return str(table_name)

get_validator `classmethod` #

get_validator()

Return the parent Base Pydantic model (without a table definition).

Source code in sc_crawler/table_bases.py

@classmethod
def get_validator(cls) -> Union["ScModel", None]:
    """Look up the table-less parent Base Pydantic model of this class.

    Returns:
        The class stored in `__validator__`, or `None` when the model
        config has no `table` entry.
    """
    has_table = cls.model_config.get("table") is not None
    return cls.__validator__ if has_table else None

get_scd `classmethod` #

get_scd()

Return the SCD version of the SQLModel table.

Source code in sc_crawler/table_bases.py

@classmethod
def get_scd(cls) -> Union["ScModel", None]:
    """Find the SCD counterpart of this SQLModel table.

    Returns:
        The single entry of `tables_scd` whose validator equals this
        class' validator, or `None` when the model config has no
        `table` entry.

    Raises:
        ValueError: If the number of matching SCD tables is not
            exactly one.
    """
    is_table = cls.model_config.get("table") is not None
    if not is_table:
        return None
    # imported lazily, and only once the class is known to be a table
    from .tables_scd import tables_scd

    target = cls.get_validator()
    matches = []
    for candidate in tables_scd:
        if candidate.get_validator() == target:
            matches.append(candidate)
    if len(matches) == 1:
        return matches[0]
    raise ValueError("Not found SCD definition.")

hash `classmethod` #

hash(session, ignored=['observed_at'], progress=None)

Hash the content of the rows.

Parameters:

Name	Type	Description	Default
`session`	`Session`	Database connection to use for object lookups.	required
`ignored`	`List[str]`	List of column names to exclude from hashing.	`['observed_at']`
`progress`	`Optional[Progress]`	Optional progress bar to track the status of the hashing.	`None`

Returns:

Type	Description
`dict`	Dictionary of the row hashes keyed by the JSON dump of primary keys.

Source code in sc_crawler/table_bases.py

@classmethod
def hash(
    cls,
    session: Session,
    ignored: List[str] = ["observed_at"],
    progress: Optional[Progress] = None,
) -> dict:
    """Compute a SHA-1 digest for every row of the table.

    Each row is dumped to a dict, its primary key values are serialized
    to JSON (with sorted keys) to form the row identifier, and the
    remaining columns, minus the ignored ones, are serialized to JSON
    and hashed.

    Args:
        session: Database connection to use for object lookups.
        ignored: List of column names to exclude from hashing.
        progress: Optional progress bar to track the status of the hashing.

    Returns:
        Dictionary of the row hashes keyed by the JSON dump of primary keys.
    """
    key_columns = sorted(cls.get_columns()["primary_keys"])
    records = session.exec(statement=select(cls))
    total = session.query(cls).count()
    if progress:
        task = progress.add_task(cls.get_table_name(), total=total)

    # columns that never take part in the digest
    excluded = [*ignored, *key_columns]
    # large tables report progress in steps of 1000 rows, as updating
    # the progress bar is expensive
    batched = total > 1e3

    # a plain dict, as the result needs to be serialized to JSON anyway
    digests = {}
    for index, record in enumerate(records):
        # NOTE Pydantic is warning when read Gpu/Storage as dict
        # https://github.com/tiangolo/sqlmodel/issues/63#issuecomment-1081555082
        payload = record.model_dump(warnings=False)
        key_id = dumps(
            {name: payload.get(name) for name in key_columns}, sort_keys=True
        )
        for name in excluded:
            payload.pop(name, None)
        serialized = dumps(payload, sort_keys=True)
        digests[key_id] = sha1(serialized.encode()).hexdigest()

        if not progress:
            continue
        if not batched:
            progress.update(task, advance=1)
            continue
        if (index + 1) % 1000 == 0:
            progress.update(task, advance=1000)
        # report the remainder once the last counted row is reached
        if index == total - 1:
            progress.update(task, advance=total % 1000)

    return digests

MetaColumns #

Bases: ScModel

Helper class to add the status and observed_at columns.

Source code in sc_crawler/table_bases.py

class MetaColumns(ScModel):
    """Mixin contributing the `status` and `observed_at` columns."""

    # defaults to Status.ACTIVE when no value is given
    status: Status = Field(
        description="Status of the resource (active or inactive).",
        default=Status.ACTIVE,
    )
    # datetime.utcnow is both the default factory and the value passed
    # as the column's `onupdate`
    observed_at: datetime = Field(
        description="Timestamp of the last observation.",
        sa_column_kwargs={"onupdate": datetime.utcnow},
        default_factory=datetime.utcnow,
    )

BenchmarkScoreFields #

Bases: HasBenchmarkPKFK, HasServerPK, HasVendorPKFK

Source code in sc_crawler/table_bases.py

class BenchmarkScoreFields(HasBenchmarkPKFK, HasServerPK, HasVendorPKFK):
    """Fields of a benchmark score: `config`, `score` and `note`."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # HashableDict is used because the column is part of the primary key
    # and so must be hashable; plain dict is also accepted to avoid
    # PydanticInvalidForJsonSchema
    config: HashableDict | dict = Field(
        description='Dictionary of config parameters of the specific benchmark, e.g. {"bandwidth": 4096}',
        primary_key=True,
        sa_type=HashableJSON,
        default={},
    )
    score: float = Field(
        description="The resulting score of the benchmark.",
    )
    note: Optional[str] = Field(
        description="Optional note, comment or context on the benchmark score.",
        default=None,
    )

    @model_validator(mode="before")
    def update_config_to_hashable(cls, values):
        """Replace `config` with a key-sorted `HashableDict`.

        The primary key needs a hashable value. The keys are sorted so
        that the resulting JSON can be compared as text as well (as
        some database engines do).
        """
        raw_config = values.get("config", {})
        ordered_items = sorted(raw_config.items())
        values["config"] = HashableDict(ordered_items)
        return values

update_config_to_hashable #

update_config_to_hashable(values)

We need a hashable column for the primary key.

Note that we also sort the keys, so that the resulting JSON can be compared as text as well (as some database engines do).

Source code in sc_crawler/table_bases.py

@model_validator(mode="before")
def update_config_to_hashable(cls, values):
    """Replace `config` with a key-sorted `HashableDict`.

    The primary key needs a hashable value. The keys are sorted so
    that the resulting JSON can be compared as text as well (as
    some database engines do).
    """
    raw_config = values.get("config", {})
    ordered_items = sorted(raw_config.items())
    values["config"] = HashableDict(ordered_items)
    return values

table_bases

sc_crawler.table_bases #

ScMetaModel #

ScModel #

__tablename__ #

get_columns classmethod #

get_table_name classmethod #

get_validator classmethod #

get_scd classmethod #

hash classmethod #

MetaColumns #

BenchmarkScoreFields #

update_config_to_hashable #

__tablename__ #

get_columns `classmethod` #

get_table_name `classmethod` #

get_validator `classmethod` #

get_scd `classmethod` #

hash `classmethod` #