Source code for HELPpy.utility.selection

import pandas as pd
import numpy as np
import os
from typing import List, Dict, Tuple, Union, Callable
import matplotlib.pyplot as plt
from ..models.labelling import labelling
from ..visualization.plot import svenn_intesect
import random
from ..models.labelling import labelling
from ..visualization.plot import svenn_intesect
import random


[docs]
def delrows_with_nan_percentage(df: pd.DataFrame, perc: float=100., verbose=False):
    """
    Drop the rows of a DataFrame whose share of NaN values reaches a given percentage.

    A row is removed when at least ``perc`` percent of its values are NaN; with the
    default of 100 only the rows made entirely of NaN are dropped.

    Parameters:
    :param: pd.DataFrame df: The input DataFrame.
    :param: float perc: NaN percentage at or above which a row is removed. Default is 100.0.
    :param: bool verbose: Whether to print how many rows were removed. Default is False.

    :return: A new DataFrame without the rows that reach the given percentage of NaN values.
    :rtype: pd.DataFrame
    """
    n_cols = df.shape[1]
    # A row is kept only if strictly more than (100 - perc)% of its values are present,
    # i.e. if its non-NaN count is >= floor((100 - perc) * n_cols / 100) + 1.
    # Multiply before dividing: ((100 - 43) / 100) * 100 evaluates to 56.99999999999999,
    # which would let rows with exactly perc% NaN slip through.
    min_count = int((100 - perc) * n_cols / 100 + 1)
    df_filtered = df.dropna(axis=0, thresh=min_count)
    if verbose:
        print(f"Removed {len(df)-len(df_filtered)} rows from {len(df)} with at least {perc}% NaN")
    return df_filtered



[docs]
def filter_crispr_by_model(df: pd.DataFrame, df_map: pd.DataFrame, minlines: int=1, 
                           line_colname: str='ModelID', line_group: str='OncotreeLineage'):
    """
    Restrict a CRISPR DataFrame to the cell lines of sufficiently populated groups.

    Cell lines are the columns of ``df``. A column is kept when its ID appears in
    ``df_map`` and the group it belongs to (``line_group``) passes ``filter_cellmap``
    with the given ``minlines``, counting only the lines that are also columns of ``df``.

    :param: pd.DataFrame df: The CRISPR DataFrame to be filtered (cell lines as columns).
    :param: pd.DataFrame df_map: The mapping DataFrame relating cell lines to groups.
    :param: int minlines: The minimum number of lines required for a group to be kept. Default is 1.
    :param: str line_colname: The column of ``df_map`` holding the cell line ID. Default is 'ModelID'.
    :param: str line_group: The column of ``df_map`` holding the tissue/lineage group. Default is 'OncotreeLineage'.

    :return: The columns of ``df`` that belong to the selected cell lines.
    :rtype: pd.DataFrame
    """
    # Cell line IDs of the mapping rows that carry a group label
    has_group = df_map[line_group].notna()
    labelled_lines = df_map.loc[has_group, line_colname].values

    # Of those, the ones that are actually columns of the CRISPR DataFrame
    shared_lines = np.intersect1d(df.columns, labelled_lines)

    # Mapping rows of the shared lines, then only the groups with enough lines
    shared_rows = df_map[df_map[line_colname].isin(shared_lines)]
    retained_rows = filter_cellmap(shared_rows, minlines, line_group=line_group)

    # Columns of the CRISPR DataFrame that survived the group filter
    columns_to_keep = np.intersect1d(df.columns, retained_rows[line_colname].values)
    return df[columns_to_keep]




[docs]
def filter_cellmap(df_map: pd.DataFrame, minlines: int=1, line_group: str='OncotreeLineage'):

    """
    Filters a cell map DataFrame based on the minimum number of lines per group.

    Rows whose group value is missing are never retained, because missing values
    are not counted as a group.
    
    Parameters:
    :param pd.DataFrame df_map: The input DataFrame containing cell map information.
    :param int minlines: The minimum number of lines required to retain a group. Default: 1.
    :param str line_group: Column name for the grouping information in the cell map DataFrame. Default: 'OncotreeLineage'.
    
    :return: Filtered DataFrame containing only the groups that meet the minimum lines criteria.
    :rtype: pd.DataFrame
    
    :example:
    
    .. code-block:: python

        filtered_df = filter_cellmap(cell_map_data, minlines=10, line_group='OncotreeLineage')
    """
    # Number of rows per group; NaN labels are excluded from the count
    group_sizes = df_map[line_group].value_counts(dropna=True)
    # Keep the labels in their own dtype: casting them to str would make
    # non-string labels (e.g. integer codes) never match in isin() below.
    kept_groups = group_sizes[group_sizes >= minlines].index
    return df_map[df_map[line_group].isin(kept_groups)]

    
# select cell lines from depmap CRISPR file

[docs]
def select_cell_lines(df: pd.DataFrame, df_map: pd.DataFrame, tissue_list: Union[str,List[str]], 
                      line_group='OncotreeLineage', line_col='ModelID', nested=False, verbose=0):
    """
    Select cell lines based on tissue and mapping information.

    :param pd.DataFrame df: DataFrame whose columns are the cell lines.
    :param pd.DataFrame df_map: DataFrame containing mapping information.
    :param Union[str, List[str]] tissue_list: Tissues for which cell lines need to be selected,
        or the string "all" to use every tissue that has at least one cell line among the columns of 'df'.
    :param str line_group: The column in 'df_map' to use for tissue selection (default is 'OncotreeLineage').
    :param str line_col: The column in 'df_map' to use for line selection (default is 'ModelID').
    :param bool nested: Whether to return cell lines as nested lists (lists for each tissue to enable mode of mode in labelling).
    :param int verbose: Verbosity level for printing information.

    :return: List of selected cell lines, either flattened or nested based on the 'nested' parameter.
    :rtype: List
    :raises ValueError: If 'tissue_list' is a string other than "all", or if a tissue
        has no cell line in common with the columns of 'df'.

    :example:

    .. code-block:: python

        df = pd.DataFrame(...)
        df_map = pd.DataFrame(...)
        tissue_list = ['Tissue1', 'Tissue2']
        selected_lines = select_cell_lines(df, df_map, tissue_list, line_group='OncotreeLineage', line_col='ModelID', nested=False, verbose=1)
    """
    # Only test against 'all' when the argument really is a string: comparing an
    # array of tissues with == would be evaluated element-wise and cannot be used in `if`.
    if isinstance(tissue_list, str):
        if tissue_list != 'all':
            raise ValueError("tissue_list argument can be the string \"all\" or any list of strings!")

        # Treat the case that all tissues are selected for mode on tissue-specific labels:
        # cell lines of the mapping that are also columns of the main DataFrame ...
        shared_lines = np.intersect1d(df.columns, df_map[line_col].values)
        in_df = df_map[line_col].isin(shared_lines)
        # ... and the (sorted) tissues they belong to. Lines without a tissue label are
        # skipped: NaN cannot be sorted with strings and would never match a tissue below.
        tissue_list = np.unique(df_map.loc[in_df, line_group].dropna().values)

    lines = []
    n_selected = 0

    # Iterate over each tissue in the provided list
    for tissue in tissue_list:
        # Get the cell lines from the mapping DataFrame for the given tissue
        map_cell_lines = df_map[df_map[line_group] == tissue][line_col].values

        # Intersect with cell lines in the main DataFrame
        dep_cell_lines = np.intersect1d(df.columns, map_cell_lines)

        # Check the list of lines is not empty (the tissue may not be in the model)
        if len(dep_cell_lines) == 0:
            raise ValueError(f"empty lits of line ... the tissue {tissue} may not be in the model.")

        # Append cell lines to the result list (either nested or flattened)
        if nested:
            lines.append(list(dep_cell_lines))
        else:
            lines.extend(dep_cell_lines)
        n_selected += len(dep_cell_lines)

        # Print verbose information if requested
        if verbose:
            print(f'There are {len(dep_cell_lines)} "{tissue}" cell-lines in the CRISPR file '
                f'in common with the {len(map_cell_lines)} cell-lines in DepMap')

    # Print total selected cell lines if verbose (counted per line, also when nested)
    if verbose:
        print(f'A total of {n_selected} have been selected for {tissue_list}.')

    # Return the list of selected cell lines
    return lines



[docs]
def set_seed(seed=1):
    """
    Seed Python's ``random`` module and NumPy's global random generator with the same value.

    :param seed: Value passed to both ``random.seed`` and ``np.random.seed``. Default is 1.
    """
    # Built-in generator first, then NumPy's global one
    for seeder in (random.seed, np.random.seed):
        seeder(seed)



[docs]
def EG_tissues_intersect(tissues: Dict[str, pd.DataFrame], common_df: Union[None, pd.DataFrame] = None,
                         labelname: str='label', labelval: str='E', display: bool = False, verbose: bool = False, 
                         barheight: int = 2, barwidth: int = 10, fontsize: int = 17) -> Tuple[Dict[str, set], set, Dict[str, set]]:
    """
    Calculate the intersection and differences of gene sets across multiple tissues.

    For every tissue the gene set is the index of the rows whose ``labelname`` column
    equals ``labelval``; genes carrying that label in ``common_df`` are removed from
    every tissue set when ``common_df`` is given.

    :param Dict[str, pd.DataFrame] tissues: Dictionary of tissue names and associated dataframes.
    :param Union[None, pd.DataFrame] common_df: DataFrame containing common data, or None to subtract nothing.
    :param str labelname: Name of the label column in the dataframes.
    :param str labelval: Value to consider as the target label.
    :param bool display: Whether to display a Venn diagram.
    :param bool verbose: Whether to print verbose information.
    :param int barheight: Height of bars in the Venn diagram.
    :param int barwidth: Width of the Venn diagram.
    :param int fontsize: Font size of the Venn diagram labels.

    :return: A tuple containing sets of genes for each tissue,
             the intersection of genes, and differences in genes
             (genes found in one tissue only). With an empty ``tissues``
             dictionary the result is ``({}, set(), {})``.
    :rtype: Tuple[Dict[str, set], set, Dict[str, set]] 

    :example:

    .. code-block:: python

        tissues = {'Tissue1': pd.DataFrame(...), 'Tissue2': pd.DataFrame(...), ...}
        common_df = pd.DataFrame(...)  # Optional
        sets, inset, diffs = EG_tissues_intersect(tissues, common_df, labelname='label', labelval='E', display=True)
    """
    # Genes to subtract from every tissue: those labelled labelval in common_df, if given
    if common_df is None:
        common_set = set()
    else:
        common_set = set(common_df[common_df[labelname] == labelval].index.values)
        if verbose:
            print(f"Subtracting {len(common_set)} common EGs...")

    # Per-tissue set of target-labelled genes, minus the common ones
    sets = {
        tissue: set(tissue_df[tissue_df[labelname] == labelval].index.values) - common_set
        for tissue, tissue_df in tissues.items()
    }

    # If display is True, display a Venn diagram (nothing to draw without tissues)
    if display and sets:
        svenn_intesect(list(sets.values()), list(sets.keys()), figsize=(barwidth, barheight * len(tissues)), fontsize=fontsize)

    # Intersection of all tissue sets; set.intersection() needs at least one
    # argument, so an empty input yields an empty intersection.
    inset = set.intersection(*sets.values()) if sets else set()

    # Print verbose information about the overlapping genes
    if verbose:
        print(f'Overlapping of {len(inset)} genes between {list(sets.keys())}')

    # Genes exclusive to each tissue: its own set minus the union of all the others.
    # set().union() with no arguments is empty, which covers the single-tissue case.
    diffs = {}
    for tissue, own_set in sets.items():
        others = set().union(*(other for name, other in sets.items() if name != tissue))
        diffs[tissue] = own_set - others
        if verbose:
            print(f'{len(diffs[tissue])} genes only in {tissue}')

    # Return the sets of EGs, intersection, and differences
    return sets, inset, diffs

                             
# Compute intersection of essential genes by tissues

[docs]
def EG_tissues_intersect_dolabelling(df: pd.DataFrame, df_map: pd.DataFrame, tissues: List[str] = [], subtract_common: bool = False, three_class: bool = False,
                              display: bool = False, verbose: bool = False, barheight: int = 2, barwidth: int = 10, fontsize: int = 17) -> Tuple[List[set], set, Dict[str, set]]:
    """
    Identify overlapping and unique Essential Genes (EGs) by tissues.

    Each tissue is labelled on its own cell lines and the genes labelled 'E' form
    its set; the function then reports the genes shared by all tissues and the
    genes found in one tissue only.

    :param pd.DataFrame df: DataFrame containing cell line information.
    :param pd.DataFrame df_map: DataFrame containing mapping information.
    :param List[str] tissues: List of tissues for which EGs need to be identified.
    :param bool subtract_common: Whether to subtract common EGs from pantissue labeling.
    :param bool three_class: Whether to use a three-class labeling (E, NE, NC).
        NOTE(review): this flag is currently not forwarded to the labelling call - confirm the intended wiring.
    :param bool display: Whether to display a Venn diagram.
    :param bool verbose: Verbosity level for printing information.
    :param int barheight: Height of the Venn diagram.
    :param int barwidth: Width of the Venn diagram.
    :param int fontsize: Font size for the Venn diagram.

    :return: Tuple containing sets of EGs, intersection of EGs, and differences in EGs.
             With no tissues the result is ``([], set(), {})``.
    :rtype: Tuple[List[set], set, Dict[str, set]] 
    :example:

    .. code-block:: python

        df = pd.DataFrame(...)
        df_map = pd.DataFrame(...)
        tissues = ['Tissue1', 'Tissue2']
        sets, inset, diffs = EG_tissues_intersect_dolabelling(df, df_map, tissues, subtract_common=True, three_class=False, display=True, verbose=True)
    """
    # If subtract_common is True, calculate the set of pan-tissue labels;
    # otherwise subtract nothing.
    panset = set()
    if subtract_common:
        if verbose:
            print("Subtracting common EG of pan-tissue labeling")
        pan_labels_df = labelling(df, verbose=verbose)
        panset = set(pan_labels_df[pan_labels_df['label'] == 'E'].index.values)

    # sets[i] is the EG set of names[i]; the two lists are filled together so they
    # stay aligned even if a tissue contributes no set.
    sets = []
    names = []
    for tissue in tissues:
        # Select cell lines for the given tissue
        cell_lines = select_cell_lines(df, df_map, [tissue])

        # If there are cell lines, calculate the set of EGs for the tissue
        if len(cell_lines) > 0:
            labels_df = labelling(df, cell_lines, verbose=verbose)
            newset = set(labels_df[labels_df['label'] == 'E'].index.values)
            sets.append(newset - panset)
            names.append(tissue)

    # If display is True, display a Venn diagram (nothing to draw without sets)
    if display and sets:
        svenn_intesect(sets, names, figsize=(barwidth, barheight * len(names)), fontsize=fontsize)

    # Intersection of all tissue sets; set.intersection() needs at least one
    # argument, so no tissues means an empty intersection.
    inset = set.intersection(*sets) if sets else set()

    # Print verbose information about the overlapping genes
    if verbose:
        print(f'Overlapping of {len(inset)} genes')

    # Genes exclusive to each tissue: its own set minus the union of all the others.
    # set().union() with no arguments is empty, which covers the single-tissue case.
    diffs = {}
    for i, tissue in enumerate(names):
        others = set().union(*(sets[:i] + sets[i + 1:]))
        diffs[tissue] = sets[i] - others
        if verbose:
            print(f'{len(diffs[tissue])} genes only in {tissue}')

    # Return the sets of EGs, intersection, and differences
    return sets, inset, diffs