Source code for skmiscpy.utils

import os
import sys
from pathlib import Path
from functools import lru_cache
from typing import Any, Dict, List, Union, Tuple, Type

import pandas as pd



[docs]
def here(path: str) -> str:
    """
    Construct an absolute path relative to the project root directory.

    The project root is obtained from ``_get_project_root()``, which requires
    an activated virtual environment and uses the parent directory of the
    location named by the ``VIRTUAL_ENV`` environment variable.

    Parameters
    ----------
    path : str
        A non-empty relative path to be resolved from the project root.
        Environment variables in the path (e.g. ``$NAME``) are expanded
        before the path is validated and joined.

    Returns
    -------
    str
        The absolute, resolved path constructed from the project root directory.

    Raises
    ------
    TypeError
        If the `path` parameter is not a string (this includes ``None`` and
        other falsy non-string values such as ``0`` or ``[]``).

    ValueError
        If the `path` parameter is empty or is an absolute path (after
        environment variable expansion).

    OSError
        If the script is not running inside an activated virtual environment,
        or if the `VIRTUAL_ENV` environment variable is not set, empty,
        or points to a non-existent directory.

    Examples
    --------
    1. Constructing a path to a file in the project:

    >>> from skmiscpy import here
    >>> here("data/input.csv")
    # If the project root is `/home/user/my_project` where you have a virtual env directory,
    # this will return an absolute path like
    # `/home/user/my_project/data/input.csv`.

    2. Constructing a path to a subdirectory:

    >>> here("src/my_module")
    # If the project root is `/home/user/my_project`, this will return an absolute path
    # like `/home/user/my_project/src/my_module`.

    3. Handling errors with an empty path:

    >>> here("")
    # Raises ValueError: The `path` parameter cannot be an empty string.

    4. Handling errors with an absolute path:

    >>> here("/absolute/path/to/file")
    # Raises ValueError: The `path` parameter must be relative, not absolute.

    5. Handling errors with an invalid path type:

    >>> here(123)
    # Raises TypeError: The `path` parameter must be a string.
    """
    # Validate the type first: checking emptiness first would report falsy
    # non-strings (None, 0, [], b"") as "empty string" with the wrong exception.
    if not isinstance(path, str):
        raise TypeError("The `path` parameter must be a string.")

    if not path:
        raise ValueError("The `path` parameter cannot be an empty string.")

    # Expand any environment variables in the path
    expanded_path = os.path.expandvars(path)

    queried_path = Path(expanded_path)

    # Checked after expansion so that a variable expanding to an absolute
    # path is rejected as well.
    if queried_path.is_absolute():
        raise ValueError("The `path` parameter must be relative, not absolute.")

    project_root = _get_project_root()
    full_abs_path = (project_root / queried_path).resolve()

    return str(full_abs_path)



@lru_cache(maxsize=1)
def _get_project_root():
    """
    Return the project root, i.e. the parent of the virtual environment directory.

    The result is memoised with ``lru_cache`` so the environment is inspected
    only once per successful lookup.

    Returns
    -------
    pathlib.Path
        The parent directory of the path stored in ``VIRTUAL_ENV``.

    Raises
    ------
    OSError
        If ``sys.prefix`` equals ``sys.base_prefix`` (no virtual environment
        is active), if ``VIRTUAL_ENV`` is unset or empty, or if the path it
        names does not exist.
    """
    # Inside a virtual environment, sys.prefix differs from sys.base_prefix.
    inside_venv = sys.prefix != sys.base_prefix
    if not inside_venv:
        raise OSError("Virtual environment is not activated.")

    # A missing variable is treated the same as an empty one.
    venv_location = os.environ.get("VIRTUAL_ENV", "")
    if venv_location == "":
        raise OSError(
            "The VIRTUAL_ENV environment variable is not set or is empty. "
            "Ensure that a virtual environment is activated."
        )

    venv_dir = Path(venv_location)
    if venv_dir.exists():
        return venv_dir.parent

    raise OSError(
        f"The directory specified by VIRTUAL_ENV ({venv_dir}) does not exist."
    )


def _check_required_columns(
    data: pd.DataFrame, required_columns: Union[str, List[str]]
) -> None:
    """
    Checks if the DataFrame contains the required columns.

    Parameters
    ----------
    data : pd.DataFrame
        The DataFrame to check for required columns.

    required_columns : str or list of str
        A column name or a list of column names that are required to be present in the DataFrame.

    Raises
    ------
    TypeError
        If `required_columns` is neither a string nor a list.

    ValueError
        If any of the required columns are missing from the DataFrame. The
        missing columns are listed in the order they were requested, without
        duplicates.
    """
    if isinstance(required_columns, str):
        requested = [required_columns]
    elif isinstance(required_columns, list):
        requested = required_columns
    else:
        raise TypeError("`required_columns` must be a string or a list of strings.")

    available = set(data.columns)

    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # error message is deterministic (iterating a set is not).
    missing_cols = [col for col in dict.fromkeys(requested) if col not in available]

    if missing_cols:
        # str() so that non-string labels are reported instead of making
        # the join itself fail with an unrelated TypeError.
        raise ValueError(
            f"The DataFrame is missing the following required columns: {', '.join(map(str, missing_cols))}"
        )


def _check_param_type(
    params: Dict[str, Any], param_type: Union[Type, Tuple[Type, ...]]
) -> None:
    """
    Validate that every non-``None`` parameter value has the expected type.

    Parameters
    ----------
    params : Dict[str, Any]
        Mapping from parameter name to parameter value. Values that are
        ``None`` are skipped.

    param_type : Type or tuple of Type
        The type or tuple of types to check against (e.g., str, bool, (int, float)).

    Raises
    ------
    TypeError
        For the first parameter (in iteration order) whose value is not
        ``None`` and is not an instance of `param_type`.
    """
    for name, value in params.items():
        # None is always accepted.
        if value is None:
            continue
        if isinstance(value, param_type):
            continue
        raise TypeError(
            f"The `{name}` parameter must be of type {_get_type_name(param_type)}."
        )


def _get_type_name(param_type: Union[Type, Tuple[Type, ...]]) -> str:
    """
    Return a readable name for a type or for a tuple of types.

    A single type yields its ``__name__``; several types yield their names
    joined with ``" or "`` (e.g. ``"int or float"``).
    """
    # Treat a lone type as a one-element collection so both cases share one join.
    candidates = (param_type,) if isinstance(param_type, Type) else param_type
    return " or ".join(candidate.__name__ for candidate in candidates)


def _check_variance_positive(variance: float, custom_msg: str) -> None:
    """
    Check if the variance is strictly positive.

    Parameters
    ----------
    variance : float
        The variance value to check. It should be a positive number.
    custom_msg : str
        A description of what the variance belongs to. It is inserted into
        the error message after "The variance of".

    Raises
    ------
    ValueError
        If the variance is not strictly positive. This includes zero,
        negative values and NaN.
    """
    # `not (variance > 0)` rather than `variance <= 0`: every comparison with
    # NaN is False, so the latter would let NaN through as "positive".
    if not (variance > 0):
        raise ValueError(
            f"The variance of {custom_msg} must be strictly positive. Found: {variance}."
        )


def _check_proportion_within_range(proportion: float, custom_msg: str) -> None:
    """
    Check that the proportion lies strictly between 0 and 1.

    Parameters
    ----------
    proportion : float
        The proportion value to check. It must satisfy ``0 < proportion < 1``;
        the endpoints 0 and 1 are rejected.
    custom_msg : str
        A description of the quantity being checked. It is inserted into the
        error message after "The".

    Raises
    ------
    ValueError
        If the proportion is not within the open interval (0, 1).
    """
    # Guard clause: a valid proportion needs no further work.
    if 0 < proportion < 1:
        return

    raise ValueError(
        f"The {custom_msg} must be within the range (0, 1). Found: {proportion}."
    )


def _classify_columns(df: pd.DataFrame, columns: list[str]) -> dict[str, str]:
    """
    Label each requested column as 'binary' or 'continuous'.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the data.
    columns : list[str]
        Names of the columns to classify.

    Returns
    -------
    dict[str, str]
        Maps each column name to 'binary' when the column has exactly two
        distinct non-missing values, and to 'continuous' otherwise.

    Notes
    -----
    - Missing values are ignored when counting distinct values.
    - The column dtype is not inspected; only the number of distinct values
      matters. A column with fewer than two distinct values is therefore
      labelled 'continuous' as well.
    """
    # Count distinct non-missing values once per requested column.
    distinct_counts = {name: df[name].nunique(dropna=True) for name in columns}

    return {
        name: ("binary" if count == 2 else "continuous")
        for name, count in distinct_counts.items()
    }