Module `skm_pyutils.table`

Utilities for pandas dataframes.

Expand source code

"""Utilities for pandas dataframes."""
import os

import numpy as np
import pandas as pd


def list_to_df(in_list, headers=None, transpose=False):
    """
    Convert a list to a dataframe with the given headers.

    Tranpose handles the list shape.
    If transpose is False, list is assumed to be like
    [
        (row 1) [1_1, 1_2, ..., 1_N]
        ...
        (row M) [M_1, M_2, ..., M_N]
    ]
    Otherwise, the list is assumed to be like
    [
        (col 1) [1_1, 2_1, ..., M_1]
        ...
        (col N) [1_N, 2_N, ..., M_N]
    ]

    Parameters
    ----------
    in_list : list
        The list to convert.
    tranpose : bool, optional
        Whether to transpose the list, by default False.
    headers : list, optional
        A list of headers for the data. By default is V1, V2, ... VN.

    Returns
    -------
    pandas.DataFrame

    """
    if headers is None:
        if not transpose:
            headers = ["V{}".format(i) for i in range(len(in_list[0]))]
        else:
            headers = ["V{}".format(i) for i in range(len(in_list))]

    if transpose:
        df = pd.DataFrame(in_list).T
        df.columns = headers
    else:
        df = pd.DataFrame.from_records(in_list, columns=headers)
    return df


def df_from_file(filename):
    """
    Read a pandas.DataFrame from filename.

    Parameters
    ----------
    filename : str
        The path to the file to read

    Returns
    -------
    pandas.DataFrame or TextParser
        The read data

    """
    ext = os.path.splitext(filename)[1]
    if ext == ".psv":
        df = pd.read_csv(filename, delimiter="|")
    elif ext == ".csv":
        df = pd.read_csv(filename)
    elif ext == ".xlsx":
        df = pd.read_excel(filename)
    else:
        raise ValueError(f"Unsupported file extension {ext}")
    return df


def df_to_file(df, filename, index=False, **kwargs):
    """
    Save a pandas.DataFrame to filename.

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe to save.
    filename : str
        The path of the file to save to.
    index : bool
        Whether to write row names, by default False.
    kwargs : keyword arguments
        Passed to pandas method.

    Returns
    -------
    None

    """

    def get_to_retry():
        print(f"{filename} may currently be in use, try closing it")
        retry = True
        continue_ = True
        while retry:
            done = input("When closed, please enter y to retry, or q to quit:\n")
            if len(done) == 0:
                retry = True
            elif done.strip().lower() == "y":
                retry = False
            elif done.strip().lower() == "q":
                retry = False
                continue_ = False
        if continue_:
            print(f"Retrying saving to {filename}")
        return continue_

    ext = os.path.splitext(filename)[1]
    if os.path.dirname(filename) != "":
        os.makedirs(os.path.dirname(filename), exist_ok=True)
    if ext == ".psv":
        try:
            df.to_csv(filename, sep="|", index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_csv(filename, sep="|", index=index, **kwargs)
    elif ext == ".csv":
        try:
            df.to_csv(filename, sep=",", index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_csv(filename, sep=",", index=index, **kwargs)
    elif ext == ".xlsx":
        try:
            df.to_excel(filename, index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_excel(filename, index=index, **kwargs)
    else:
        raise ValueError(f"Unsupported file extension {ext}")


def df_subset_from_rows(df, rows):
    """
    Get a subset of a dataframe based on row indices.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to subset from.
    rows : list-like of int
        The rows to grab.

    Returns
    -------
    pandas.DataFrame

    """
    return df.iloc[rows].copy().reset_index()


def show_interactive_table(table, notebook=False) -> None:
    import dtale

    ## TODO maybe should have a config for notebook version
    if notebook:
        dtale.show(table)
    else:
        dtale.show(table).open_browser()


def filter_table(
    table: "pd.DataFrame", filter_dict: "dict[str, list]", and_: "bool" = True
) -> "pd.DataFrame":
    """
    Filter a table based on a dictionary with possible values.

    Parameters
    ----------
    table: pd.DataFrame
        The table to filter
    filter_dict: dict[str, list]
        The dictionary to filter with.
        key = column in df, val = possible values for that column
    and_: bool, optional
        Whether to combine with logical_and or with logical_or,
        By default logical_and.

    Returns
    -------
    filtered_dataframe : pd.DataFrame
        The filtered_dataframe

    """
    if len(filter_dict) == 0:
        return table
    filters = [table[k].isin(v) for k, v in filter_dict.items()]
    if and_:
        full_mask = np.logical_and.reduce(np.array(filters))
    else:
        full_mask = np.logical_or.reduce(np.array(filters))
    return table[full_mask]

Functions

def df_from_file(filename)

Read a pandas.DataFrame from filename.

Parameters

filename : str: The path to the file to read

Returns

pandas.DataFrame or TextParser: The read data

Expand source code

def df_from_file(filename):
    """
    Read a pandas.DataFrame from filename.

    Parameters
    ----------
    filename : str
        The path to the file to read

    Returns
    -------
    pandas.DataFrame or TextParser
        The read data

    """
    ext = os.path.splitext(filename)[1]
    if ext == ".psv":
        df = pd.read_csv(filename, delimiter="|")
    elif ext == ".csv":
        df = pd.read_csv(filename)
    elif ext == ".xlsx":
        df = pd.read_excel(filename)
    else:
        raise ValueError(f"Unsupported file extension {ext}")
    return df

def df_subset_from_rows(df, rows)

Get a subset of a dataframe based on row indices.

Parameters

df : pandas.DataFrame: The dataframe to subset from.
rows : list-like of int: The rows to grab.

Returns

pandas.DataFrame

Expand source code

def df_subset_from_rows(df, rows):
    """
    Get a subset of a dataframe based on row indices.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to subset from.
    rows : list-like of int
        The rows to grab.

    Returns
    -------
    pandas.DataFrame

    """
    return df.iloc[rows].copy().reset_index()

def df_to_file(df, filename, index=False, **kwargs)

Save a pandas.DataFrame to filename.

Parameters

df : pandas.DataFrame: The input dataframe to save.
filename : str: The path of the file to save to.
index : bool: Whether to write row names, by default False.
kwargs : keyword arguments: Passed to pandas method.

Returns

None

Expand source code

def df_to_file(df, filename, index=False, **kwargs):
    """
    Save a pandas.DataFrame to filename.

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe to save.
    filename : str
        The path of the file to save to.
    index : bool
        Whether to write row names, by default False.
    kwargs : keyword arguments
        Passed to pandas method.

    Returns
    -------
    None

    """

    def get_to_retry():
        print(f"{filename} may currently be in use, try closing it")
        retry = True
        continue_ = True
        while retry:
            done = input("When closed, please enter y to retry, or q to quit:\n")
            if len(done) == 0:
                retry = True
            elif done.strip().lower() == "y":
                retry = False
            elif done.strip().lower() == "q":
                retry = False
                continue_ = False
        if continue_:
            print(f"Retrying saving to {filename}")
        return continue_

    ext = os.path.splitext(filename)[1]
    if os.path.dirname(filename) != "":
        os.makedirs(os.path.dirname(filename), exist_ok=True)
    if ext == ".psv":
        try:
            df.to_csv(filename, sep="|", index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_csv(filename, sep="|", index=index, **kwargs)
    elif ext == ".csv":
        try:
            df.to_csv(filename, sep=",", index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_csv(filename, sep=",", index=index, **kwargs)
    elif ext == ".xlsx":
        try:
            df.to_excel(filename, index=index, **kwargs)
        except PermissionError:
            continue_ = get_to_retry()
            if continue_:
                df.to_excel(filename, index=index, **kwargs)
    else:
        raise ValueError(f"Unsupported file extension {ext}")

def filter_table(table: pd.DataFrame, filter_dict: dict[str, list], and_: bool = True) ‑> pandas.core.frame.DataFrame

Filter a table based on a dictionary with possible values.

Parameters

table : pd.DataFrame: The table to filter
filter_dict : dict[str, list]: The dictionary to filter with. key = column in df, val = possible values for that column
and_ : bool, optional: Whether to combine with logical_and or with logical_or, By default logical_and.

Returns

filtered_dataframe : pd.DataFrame: The filtered_dataframe

Expand source code

def filter_table(
    table: "pd.DataFrame", filter_dict: "dict[str, list]", and_: "bool" = True
) -> "pd.DataFrame":
    """
    Filter a table based on a dictionary with possible values.

    Parameters
    ----------
    table: pd.DataFrame
        The table to filter
    filter_dict: dict[str, list]
        The dictionary to filter with.
        key = column in df, val = possible values for that column
    and_: bool, optional
        Whether to combine with logical_and or with logical_or,
        By default logical_and.

    Returns
    -------
    filtered_dataframe : pd.DataFrame
        The filtered_dataframe

    """
    if len(filter_dict) == 0:
        return table
    filters = [table[k].isin(v) for k, v in filter_dict.items()]
    if and_:
        full_mask = np.logical_and.reduce(np.array(filters))
    else:
        full_mask = np.logical_or.reduce(np.array(filters))
    return table[full_mask]

def list_to_df(in_list, headers=None, transpose=False)

Convert a list to a dataframe with the given headers.

Tranpose handles the list shape. If transpose is False, list is assumed to be like [ (row 1) [1_1, 1_2, …, 1_N] … (row M) [M_1, M_2, …, M_N] ] Otherwise, the list is assumed to be like [ (col 1) [1_1, 2_1, …, M_1] … (col N) [1_N, 2_N, …, M_N] ]

Parameters

in_list : list: The list to convert.
tranpose : bool, optional: Whether to transpose the list, by default False.
headers : list, optional: A list of headers for the data. By default is V1, V2, … VN.

Returns

pandas.DataFrame

Expand source code

def list_to_df(in_list, headers=None, transpose=False):
    """
    Convert a list to a dataframe with the given headers.

    Tranpose handles the list shape.
    If transpose is False, list is assumed to be like
    [
        (row 1) [1_1, 1_2, ..., 1_N]
        ...
        (row M) [M_1, M_2, ..., M_N]
    ]
    Otherwise, the list is assumed to be like
    [
        (col 1) [1_1, 2_1, ..., M_1]
        ...
        (col N) [1_N, 2_N, ..., M_N]
    ]

    Parameters
    ----------
    in_list : list
        The list to convert.
    tranpose : bool, optional
        Whether to transpose the list, by default False.
    headers : list, optional
        A list of headers for the data. By default is V1, V2, ... VN.

    Returns
    -------
    pandas.DataFrame

    """
    if headers is None:
        if not transpose:
            headers = ["V{}".format(i) for i in range(len(in_list[0]))]
        else:
            headers = ["V{}".format(i) for i in range(len(in_list))]

    if transpose:
        df = pd.DataFrame(in_list).T
        df.columns = headers
    else:
        df = pd.DataFrame.from_records(in_list, columns=headers)
    return df

def show_interactive_table(table, notebook=False) ‑> None

Expand source code

def show_interactive_table(table, notebook=False) -> None:
    import dtale

    ## TODO maybe should have a config for notebook version
    if notebook:
        dtale.show(table)
    else:
        dtale.show(table).open_browser()