Skip to content

utils

sc_crawler.utils #

jsoned_hash #

jsoned_hash(*args, **kwargs)

Hash the JSON-dump of all positional and keyword arguments.

Examples:

>>> jsoned_hash(42)
'0211c62419aece235ba19582d3cf7fd8e25f837c'
>>> jsoned_hash(everything=42)
'8f8a7fcade8cb632b856f46fc64c1725ee387617'
>>> jsoned_hash(42, 42, everything=42)
'f04a77f000d85929b13de04b436c60a1272dfbf5'
Source code in sc_crawler/utils.py
def jsoned_hash(*args, **kwargs):
    """Return the SHA1 hex digest of the JSON-serialized call arguments.

    Positional arguments are stored under the "args" key and keyword
    arguments under the "kwargs" key of a single dict, which is dumped to
    JSON with sorted keys, so the order in which keyword arguments are
    passed does not affect the hash.

    Examples:
        >>> jsoned_hash(42)
        '0211c62419aece235ba19582d3cf7fd8e25f837c'
        >>> jsoned_hash(everything=42)
        '8f8a7fcade8cb632b856f46fc64c1725ee387617'
        >>> jsoned_hash(42, 42, everything=42)
        'f04a77f000d85929b13de04b436c60a1272dfbf5'
    """
    payload = {"args": args, "kwargs": kwargs}
    serialized = dumps(payload, sort_keys=True)
    return sha1(serialized.encode()).hexdigest()

hash_database #

hash_database(connection_string, level=HashLevels.DATABASE, ignored=['observed_at'], progress=None, exclude_tables=[])

Hash the content of a database.

Parameters:

Name Type Description Default
connection_string str

SQLAlchemy connection string to connect to the database.

required
level HashLevels

The level at which to apply hashing. Possible values are 'DATABASE' (default), 'TABLE', or 'ROW'.

DATABASE
ignored List[str]

List of column names to be ignored during hashing.

['observed_at']
progress Optional[Progress]

Optional progress bar to track the status of the hashing.

None
exclude_tables List[ScModel]

Optional list of tables not to be hashed.

[]

Returns:

Type Description
Union[str, dict]

A single SHA1 hash or dict of hashes, depending on the level.

Source code in sc_crawler/utils.py
def hash_database(
    connection_string: str,
    level: HashLevels = HashLevels.DATABASE,
    ignored: Optional[List[str]] = None,
    progress: Optional[Progress] = None,
    exclude_tables: Optional[List[ScModel]] = None,
) -> Union[str, dict]:
    """Hash the content of a database.

    Every known table (except the excluded ones) is hashed via its own
    `hash` method, then the per-table results are optionally condensed
    depending on `level`.

    Args:
        connection_string: SQLAlchemy connection string to connect to the database.
        level: The level at which to apply hashing. With 'DATABASE' (default)
            a single hash of all tables is returned, with 'TABLE' one hash
            per table; any other level (e.g. 'ROW') returns the per-table
            results of the tables' `hash` method unchanged.
        ignored: List of column names to be ignored during hashing.
            Defaults to `["observed_at"]` when not provided.
        progress: Optional progress bar to track the status of the hashing.
        exclude_tables: Optional list of tables not to be hashed.
            Defaults to no exclusions when not provided.

    Returns:
        A single SHA1 hash or dict of hashes (keyed by table name),
        depending on the level.
    """
    from .tables import tables as alltables

    # None sentinels instead of mutable defaults, so that no list object is
    # shared between calls (the `ignored` list is handed over to `table.hash`).
    if ignored is None:
        ignored = ["observed_at"]
    if exclude_tables is None:
        exclude_tables = []

    tables_to_hash = [t for t in alltables if t not in exclude_tables]

    tables_task_id = None
    if progress is not None:
        tables_task_id = progress.add_task("Hashing tables", total=len(tables_to_hash))

    engine = create_engine(connection_string)
    try:
        with Session(engine) as session:
            hashes = {}
            for table in tables_to_hash:
                table_name = table.get_table_name()
                hashes[table_name] = table.hash(
                    session, ignored=ignored, progress=progress
                )
                if progress is not None:
                    progress.update(tables_task_id, advance=1)
    finally:
        # The engine only lives for this call: release its pooled connections.
        engine.dispose()

    if level == HashLevels.TABLE:
        return {name: jsoned_hash(content) for name, content in hashes.items()}

    if level == HashLevels.DATABASE:
        return jsoned_hash(hashes)

    return hashes

chunk_list #

chunk_list(items, size)

Split a list into chunks of a specified size.

Examples:

>>> [len(x) for x in chunk_list(range(10), 3)]
[3, 3, 3, 1]
Source code in sc_crawler/utils.py
def chunk_list(items: List[Any], size: int) -> Iterable[List[Any]]:
    """Yield consecutive slices of `items` holding at most `size` elements each.

    The last chunk is shorter when the length of `items` is not a multiple
    of `size`.

    Examples:
        >>> [len(x) for x in chunk_list(range(10), 3)]
        [3, 3, 3, 1]
    """
    offsets = range(0, len(items), size)
    yield from (items[offset : offset + size] for offset in offsets)

scmodels_to_dict #

scmodels_to_dict(scmodels, keys)

Creates a dict indexed by key(s) of the ScModels of the list.

When multiple keys are provided, each ScModel instance will be stored in the dict with all keys. If a key is a list, then each list element is considered (not recursively, only at first level) as a key. Conflict of keys is not checked.

Parameters:

Name Type Description Default
scmodels List[ScModel]

list of ScModel instances

required
keys List[str]

a list of strings referring to ScModel fields to be used as keys

required

Examples:

>>> from sc_crawler.vendors import aws
>>> scmodels_to_dict([aws], keys=["vendor_id", "name"])
{'aws': Vendor...
Source code in sc_crawler/utils.py
def scmodels_to_dict(scmodels: List[ScModel], keys: List[str]) -> Dict[str, ScModel]:
    """Build a lookup dict of ScModel instances indexed by the given field(s).

    Each instance is registered under the value of every field named in
    `keys`. When a field holds a list, each element of that list (first
    level only, not recursively) is used as a separate key.
    Conflicting keys are not checked: a later entry silently replaces an
    earlier one stored under the same key.

    Args:
        scmodels: list of ScModel instances
        keys: a list of strings referring to ScModel fields to be used as keys

    Examples:
        >>> from sc_crawler.vendors import aws
        >>> scmodels_to_dict([aws], keys=["vendor_id", "name"])
        {'aws': Vendor...
    """
    lookup = {}
    for field_name in keys:
        for instance in scmodels:
            field_value = getattr(instance, field_name)
            # Treat scalar fields as a single-element list of keys.
            candidates = field_value if isinstance(field_value, list) else [field_value]
            lookup.update((candidate, instance) for candidate in candidates)
    return lookup

is_sqlite #

is_sqlite(session)

Checks if a SQLModel session is bound to a SQLite database.

Source code in sc_crawler/utils.py
def is_sqlite(session: Session) -> bool:
    """Tell whether a SQLModel session is bound to a SQLite database.

    The decision is based on the dialect name of the session's bind.
    """
    dialect_name = session.bind.dialect.name
    return dialect_name == "sqlite"

is_postgresql #

is_postgresql(session)

Checks if a SQLModel session is bound to a PostgreSQL-like database.

Dialect name is checked for PostgreSQL or CockroachDB.

Source code in sc_crawler/utils.py
def is_postgresql(session: Session) -> bool:
    """Tell whether a SQLModel session is bound to a PostgreSQL-like database.

    The dialect name of the session's bind is compared against
    PostgreSQL and CockroachDB.
    """
    postgres_like_dialects = ("postgresql", "cockroachdb")
    return session.bind.dialect.name in postgres_like_dialects

float_inf_to_str #

float_inf_to_str(x)

Transform to string if a float is inf.

Source code in sc_crawler/utils.py
def float_inf_to_str(x: float) -> Union[float, str]:
    """Replace an infinite float with the string "Infinity".

    Finite values are returned unchanged. Both positive and negative
    infinity are mapped to the same "Infinity" string.
    """
    if isinf(x):
        return "Infinity"
    return x

table_name_to_model #

table_name_to_model(table_name)

Return the ScModel schema for a table name.

Source code in sc_crawler/utils.py
def table_name_to_model(table_name: str) -> ScModel:
    """Look up the ScModel schema whose table name equals `table_name`.

    Raises:
        IndexError: If none of the known tables has the given name.
    """
    from .tables import tables

    matching_models = [
        model for model in tables if model.get_table_name() == table_name
    ]
    # The first match wins; an empty result surfaces as an IndexError.
    return matching_models[0]

get_row_by_pk #

get_row_by_pk(session, model, pks)

Get a row from a table definition by primary keys.

Parameters:

Name Type Description Default
session Session

Session used to query the database.

required
model ScModel

An ScModel schema definition with table reference.

required
pks dict

Dictionary of all the primary keys for the row.

required

Returns:

Type Description
ScModel

ScModel object read from the database.

Source code in sc_crawler/utils.py
def get_row_by_pk(session: Session, model: ScModel, pks: dict) -> ScModel:
    """Fetch a single row of a table, identified by its primary keys.

    Args:
        session: Session used to query the database.
        model: An ScModel schema definition with table reference.
        pks: Dictionary mapping each primary key column name to the value
            identifying the row.

    Returns:
        ScModel object read from the database.

    Note:
        The result is fetched with `.one()`, which in SQLAlchemy raises
        when zero or multiple rows match.
    """
    statement = select(model)
    # Narrow the query with one equality filter per primary key column.
    for column_name, expected in pks.items():
        column = getattr(model, column_name)
        statement = statement.where(column == expected)
    result = session.exec(statement=statement)
    return result.one()

nesteddefaultdict #

nesteddefaultdict()

Recursive defaultdict.

Examples:

>>> foo = nesteddefaultdict()
>>> foo["bar"]["baz"] = 43
>>> from json import dumps
>>> dumps(foo)
'{"bar": {"baz": 43}}'
Source code in sc_crawler/utils.py
def nesteddefaultdict():
    """Create a defaultdict whose missing keys default to nested defaultdicts.

    Examples:
        >>> foo = nesteddefaultdict()
        >>> foo["bar"]["baz"] = 43
        >>> from json import dumps
        >>> dumps(foo)
        '{"bar": {"baz": 43}}'
    """
    # The function itself is the factory, so every level nests the same way.
    factory = nesteddefaultdict
    return defaultdict(factory)

list_search #

list_search(items, key, values)

Search for a dict in a list with the given key/value pair.

When multiple values are provided, the first item whose key field matches any of the values is returned.

Source code in sc_crawler/utils.py
def list_search(
    items: List[dict], key: str, values: Union[Any, List[Any]]
) -> Optional[dict]:
    """Search for the first dict in a list with the given key/value pair.

    Args:
        items: List of dicts to search, in order.
        key: Name of the field to compare.
        values: A single value, or a list of acceptable values. An item
            matches when its `key` field equals any of them.

    Returns:
        The first matching dict, or None when no item matches. Items
        that do not have the `key` field are skipped.
    """
    if not isinstance(values, list):
        values = [values]
    # An item without the field cannot hold the key/value pair, so it is
    # treated as a non-match instead of aborting the search with a KeyError.
    return next(
        (item for item in items if key in item and item[key] in values), None
    )