utils

sc_crawler.utils #

jsoned_hash #

jsoned_hash(*args, **kwargs)

Hash the JSON-dump of all positional and keyword arguments.

Examples:

>>> jsoned_hash(42)
'0211c62419aece235ba19582d3cf7fd8e25f837c'
>>> jsoned_hash(everything=42)
'8f8a7fcade8cb632b856f46fc64c1725ee387617'
>>> jsoned_hash(42, 42, everything=42)
'f04a77f000d85929b13de04b436c60a1272dfbf5'

Source code in sc_crawler/utils.py

def jsoned_hash(*args, **kwargs):
    """Return the SHA1 hex digest of the JSON serialization of the arguments.

    Positional and keyword arguments are wrapped into a single mapping and
    dumped with sorted keys, so the order of keyword arguments does not
    affect the resulting hash.

    Examples:
        >>> jsoned_hash(42)
        '0211c62419aece235ba19582d3cf7fd8e25f837c'
        >>> jsoned_hash(everything=42)
        '8f8a7fcade8cb632b856f46fc64c1725ee387617'
        >>> jsoned_hash(42, 42, everything=42)
        'f04a77f000d85929b13de04b436c60a1272dfbf5'
    """
    payload = {"args": args, "kwargs": kwargs}
    serialized = dumps(payload, sort_keys=True)
    digest = sha1(serialized.encode())
    return digest.hexdigest()

hash_database #

hash_database(connection_string, level=HashLevels.DATABASE, ignored=['observed_at'], progress=None, exclude_tables=[])

Hash the content of a database.

Parameters:

Name	Type	Description	Default
`connection_string`	`str`	SQLAlchemy connection string to connect to the database.	required
`level`	`HashLevels`	The level at which to apply hashing. Possible values are 'DATABASE' (default), 'TABLE', or 'ROW'.	`DATABASE`
`ignored`	`List[str]`	List of column names to be ignored during hashing.	`['observed_at']`
`progress`	`Optional[Progress]`	Optional progress bar to track the status of the hashing.	`None`
`exclude_tables`	`List[ScModel]`	Optional list of tables not to be hashed.	`[]`

Returns:

Type	Description
`Union[str, dict]`	A single SHA1 hash or dict of hashes, depending on the level.

Source code in sc_crawler/utils.py

def hash_database(
    connection_string: str,
    level: HashLevels = HashLevels.DATABASE,
    ignored: List[str] = ["observed_at"],
    progress: Optional[Progress] = None,
    exclude_tables: List[ScModel] = [],
) -> Union[str, dict]:
    """Hash the content of a database.

    Every table known to `sc_crawler.tables` (except the excluded ones) is
    hashed via its own `hash` method, then the per-table results are
    optionally collapsed depending on `level`.

    Args:
        connection_string: SQLAlchemy connection string to connect to the database.
        level: The level at which to apply hashing. Possible values are 'DATABASE' (default), 'TABLE', or 'ROW'.
        ignored: List of column names to be ignored during hashing.
            The default list is shared between calls and is never modified here.
        progress: Optional progress bar to track the status of the hashing.
        exclude_tables: Optional list of tables not to be hashed.

    Returns:
        A single SHA1 hash or dict of hashes, depending on the level:
        a single hash string for `DATABASE`, a dict of one hash per table
        name for `TABLE`, and otherwise the dict of unmodified per-table
        results of `table.hash` (presumably row-level hashes).
    """
    from .tables import tables as alltables

    tables_to_sync = [t for t in alltables if t not in exclude_tables]

    # Always bind the task id so the name exists regardless of `progress`.
    tables_task_id = None
    if progress:
        tables_task_id = progress.add_task("Hashing tables", total=len(tables_to_sync))

    engine = create_engine(connection_string)
    try:
        with Session(engine) as session:
            hashes = {}
            for table in tables_to_sync:
                table_name = table.get_table_name()
                hashes[table_name] = table.hash(
                    session, ignored=ignored, progress=progress
                )
                if progress:
                    progress.update(tables_task_id, advance=1)
    finally:
        # The engine is created only for this call: release its connection
        # pool instead of leaving the connections open until garbage collection.
        engine.dispose()

    if level == HashLevels.TABLE:
        # Collapse each table's result into a single hash.
        hashes = {k: jsoned_hash(v) for k, v in hashes.items()}
    elif level == HashLevels.DATABASE:
        # Collapse everything into a single hash.
        hashes = jsoned_hash(hashes)

    return hashes

chunk_list #

chunk_list(items, size)

Split a list into chunks of a specified size.

Examples:

>>> [len(x) for x in chunk_list(range(10), 3)]
[3, 3, 3, 1]

Source code in sc_crawler/utils.py

def chunk_list(items: List[Any], size: int) -> Iterable[List[Any]]:
    """Lazily yield consecutive slices of `items` with at most `size` elements.

    The last chunk is shorter when the length of `items` is not a multiple
    of `size`.

    Examples:
        >>> [len(x) for x in chunk_list(range(10), 3)]
        [3, 3, 3, 1]
    """
    offsets = range(0, len(items), size)
    yield from (items[start : start + size] for start in offsets)

scmodels_to_dict #

scmodels_to_dict(scmodels, keys)

Creates a dict indexed by key(s) of the ScModels of the list.

When multiple keys are provided, each ScModel instance will be stored in the dict with all keys. If a key is a list, then each list element is considered (not recursively, only at first level) as a key. Conflict of keys is not checked.

Parameters:

Name	Type	Description	Default
`scmodels`	`List[ScModel]`	list of ScModel instances	required
`keys`	`List[str]`	a list of strings referring to ScModel fields to be used as keys	required

Examples:

>>> from sc_crawler.vendors import aws
>>> scmodels_to_dict([aws], keys=["vendor_id", "name"])
{'aws': Vendor...

Source code in sc_crawler/utils.py

def scmodels_to_dict(scmodels: List[ScModel], keys: List[str]) -> Dict[str, ScModel]:
    """Build a lookup dict of ScModel instances indexed by the given field(s).

    Each ScModel is registered under the value of every field listed in
    `keys`. When a field holds a list, every element of that list becomes
    a separate key (first level only, no recursion). Key collisions are not
    detected: a later entry silently replaces an earlier one.

    Args:
        scmodels: list of ScModel instances
        keys: a list of strings referring to ScModel fields to be used as keys

    Examples:
        >>> from sc_crawler.vendors import aws
        >>> scmodels_to_dict([aws], keys=["vendor_id", "name"])
        {'aws': Vendor...
    """
    index = {}
    for field in keys:
        for model in scmodels:
            value = getattr(model, field)
            # Treat a scalar field value as a one-element list of keys.
            lookup_keys = value if isinstance(value, list) else [value]
            for lookup_key in lookup_keys:
                index[lookup_key] = model
    return index

is_sqlite #

is_sqlite(session)

Checks if a SQLModel session is bound to a SQLite database.

Source code in sc_crawler/utils.py

def is_sqlite(session: Session) -> bool:
    """Tell whether the SQLModel session is bound to a SQLite database."""
    dialect = session.bind.dialect
    return dialect.name == "sqlite"

is_postgresql #

is_postgresql(session)

Checks if a SQLModel session is bound to a PostgreSQL-like database.

Dialect name is checked for PostgreSQL or CockroachDB.

Source code in sc_crawler/utils.py

def is_postgresql(session: Session) -> bool:
    """Tell whether the SQLModel session is bound to a PostgreSQL-like database.

    The dialect name of the session's bind is compared against the
    PostgreSQL and CockroachDB dialect names.
    """
    dialect_name = session.bind.dialect.name
    return dialect_name in ("postgresql", "cockroachdb")

float_inf_to_str #

float_inf_to_str(x)

Transform to string if a float is inf.

Source code in sc_crawler/utils.py

def float_inf_to_str(x: float) -> Union[float, str]:
    """Transform to string if a float is inf.

    Positive infinity becomes `"Infinity"` and negative infinity becomes
    `"-Infinity"`, so the sign is not lost. Any other value (including NaN)
    is returned unchanged.

    Args:
        x: The number to check.

    Returns:
        `"Infinity"` or `"-Infinity"` for infinite values, otherwise `x` itself.

    Examples:
        >>> float_inf_to_str(float("inf"))
        'Infinity'
        >>> float_inf_to_str(float("-inf"))
        '-Infinity'
        >>> float_inf_to_str(42.0)
        42.0
    """
    if isinf(x):
        # isinf() is true for both signs; keep the sign in the string form.
        return "Infinity" if x > 0 else "-Infinity"
    return x

table_name_to_model #

table_name_to_model(table_name)

Return the ScModel schema for a table name.

Source code in sc_crawler/utils.py

def table_name_to_model(table_name: str) -> ScModel:
    """Look up the ScModel schema whose table name equals `table_name`.

    All tables listed in `sc_crawler.tables` are checked and the first
    match is returned. An unknown table name results in an `IndexError`,
    as the first element of an empty list of matches is requested.
    """
    # Imported at call time rather than at module level.
    from .tables import tables

    matching = list(
        filter(lambda model: model.get_table_name() == table_name, tables)
    )
    return matching[0]

get_row_by_pk #

get_row_by_pk(session, model, pks)

Get a row from a table definition by primary keys.

Parameters:

Name	Type	Description	Default
`session`	`Session`	Connection for database connections.	required
`model`	`ScModel`	An ScModel schema definition with table reference.	required
`pks`	`dict`	Dictionary of all the primary keys for the row.	required

Returns:

Type	Description
`ScModel`	ScModel object read from the database.

Source code in sc_crawler/utils.py

def get_row_by_pk(session: Session, model: ScModel, pks: dict) -> ScModel:
    """Fetch a single row of a table, identified by its primary keys.

    An equality filter is built for each entry of `pks` against the
    attribute of `model` with the same name, and all filters are combined
    in one query.

    Args:
        session: Session used to run the query against the database.
        model: An ScModel schema definition with table reference.
        pks: Dictionary of all the primary keys for the row, mapping
            column names to the values to match.

    Returns:
        ScModel object read from the database. The result is fetched with
        `.one()`, which is expected to raise unless exactly one row matches.
    """
    conditions = [getattr(model, column) == value for column, value in pks.items()]
    # Multiple criteria passed to a single where() are joined with AND.
    statement = select(model).where(*conditions)
    return session.exec(statement=statement).one()

nesteddefaultdict #

nesteddefaultdict()

Recursive defaultdict.

Examples:

>>> foo = nesteddefaultdict()
>>> foo["bar"]["baz"] = 43
>>> from json import dumps
>>> dumps(foo)
'{"bar": {"baz": 43}}'

Source code in sc_crawler/utils.py

def nesteddefaultdict():
    """Create a defaultdict whose missing keys are nested defaultdicts.

    Missing keys are created on first access at any depth, so deeply nested
    values can be assigned without initializing the intermediate levels.

    Examples:
        >>> foo = nesteddefaultdict()
        >>> foo["bar"]["baz"] = 43
        >>> from json import dumps
        >>> dumps(foo)
        '{"bar": {"baz": 43}}'
    """
    # The function is its own default factory, which makes the nesting unbounded.
    tree = defaultdict(nesteddefaultdict)
    return tree

list_search #

list_search(items, key, values)

Search for a dict in a list with the given key/value pair.

When multiple values are provided, the first item whose `key` field matches any of the values is returned.

Source code in sc_crawler/utils.py

def list_search(
    items: List[dict], key: str, values: Union[Any, List[Any]]
) -> Optional[dict]:
    """Search for a dict in a list with the given key/value pair.

    When multiple values are provided (as a list), the first item whose
    `key` field equals any of the values is returned. Items that do not
    have the `key` field at all are skipped instead of raising `KeyError`.

    Args:
        items: List of dicts to search in.
        key: Name of the field to compare.
        values: A single value or a list of accepted values. Anything that
            is not a list is treated as a single value.

    Returns:
        The first matching dict, or `None` when nothing matches.

    Examples:
        >>> list_search([{"a": 1}, {"a": 2}], "a", 2)
        {'a': 2}
        >>> list_search([{"a": 1}, {"a": 2}], "a", [3, 1])
        {'a': 1}
        >>> list_search([{"b": 1}], "a", 1) is None
        True
    """
    if not isinstance(values, list):
        values = [values]
    return next(
        (item for item in items if key in item and item[key] in values),
        None,
    )