Source code for HELPpy.preprocess.embedding

import pandas as pd
import numpy as np
import networkx as nx
from karateclub import DeepWalk, Node2Vec, AE
from typing import Dict
from tqdm import tqdm

def PPI_embed(
    df_net: pd.DataFrame,
    method: str = 'Node2Vec',
    dimensions: int = 128,
    walk_number: int = 10,
    walk_length: int = 80,
    workers: int = 1,
    epochs: int = 1,
    learning_rate: float = 0.05,
    seed: int = 42,
    params: Dict = None,
    source: str = 'A',
    target: str = 'B',
    weight: str = 'combined_score',
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Embed a protein-protein interaction (PPI) network with a graph embedding method.

    The edge table is converted into an undirected, weighted ``networkx`` graph
    whose nodes are consecutive integer indices (one per distinct gene found in
    the ``source``/``target`` columns, in sorted order). The selected embedder
    is fitted on that graph and the resulting vectors are returned indexed by
    gene name.

    :param pd.DataFrame df_net: Edge table of the PPI network.
    :param str method: Embedding method: 'DeepWalk', 'Node2Vec' or 'AE'. Default: 'Node2Vec'.
    :param int dimensions: Dimensionality of the embedding. Default: 128.
    :param int walk_number: Number of walks per node. Default: 10.
    :param int walk_length: Length of each walk. Default: 80.
    :param int workers: Number of parallel workers. Default: 1.
    :param int epochs: Number of training epochs. Default: 1.
    :param float learning_rate: Learning rate of the embedding model. Default: 0.05.
    :param int seed: Random seed for reproducibility. Default: 42.
    :param Dict params: Extra keyword arguments for the embedder constructor.
        ``None`` (the default) means
        ``{"p": 1.0, "q": 1.0, "window_size": 5, "min_count": 1}``.
        The dictionary is copied, never modified. The explicit arguments above
        (``dimensions``, ``walk_length``, ``walk_number``, ``workers``,
        ``epochs``, ``learning_rate``, ``seed``) override same-named keys.
    :param str source: Column holding the source nodes. Default: 'A'.
    :param str target: Column holding the target nodes. Default: 'B'.
    :param str weight: Column holding the edge weights. Default: 'combined_score'.
    :param bool verbose: Whether to print progress information. Default: False.

    :return: One row per gene (index = gene name), one column per embedding
        dimension, named ``f'{method}_{i}'``.
    :rtype: pd.DataFrame
    :raises AssertionError: If ``method`` is not one of the supported methods.

    :example:

    .. code-block:: python

        df_embedding = PPI_embed(ppi_data, method='Node2Vec', dimensions=128, epochs=5, verbose=True)
    """
    # Explicit raise instead of ``assert`` so the check is not stripped under
    # ``python -O``; AssertionError is kept so existing callers see the same type.
    if method not in ('DeepWalk', 'Node2Vec', 'AE'):
        raise AssertionError("Embedding method not supported!")

    # Work on a private copy: neither a shared default nor the caller's
    # dictionary must be altered by the overrides below.
    if params is None:
        params = {"p": 1.0, "q": 1.0, "window_size": 5, "min_count": 1}
    embed_params = dict(params)
    embed_params.update(
        dimensions=dimensions,
        walk_length=walk_length,
        walk_number=walk_number,
        workers=workers,
        epochs=epochs,
        learning_rate=learning_rate,
        seed=seed,
    )

    # Sorted, de-duplicated gene names; position in this array is the node index.
    genes = np.union1d(df_net[source].values, df_net[target].values)
    gene2idx_mapping = {gene: idx for idx, gene in enumerate(genes)}
    edge_list = [
        (gene2idx_mapping[src], gene2idx_mapping[dst])
        for src, dst in zip(df_net[source].values, df_net[target].values)
    ]
    edge_attr = df_net[weight].values

    # The embedder class is looked up by name among the module-level imports.
    # NOTE(review): ``embed_params`` is forwarded verbatim, so any key the chosen
    # constructor does not accept raises TypeError -- confirm that the default
    # "p"/"q" keys are valid for 'DeepWalk' and 'AE' in the installed karateclub.
    embedder = globals()[method](**embed_params)
    if verbose:
        print(f"Embedding with {method} and params:")
        for key, value in vars(embedder).items():
            print(f'{key}: {value}')

    # Create the undirected, weighted PPI graph.
    ppiG = nx.Graph()
    for (u, v), w in tqdm(zip(edge_list, edge_attr), total=len(edge_list), desc="Creating the PPI graph"):
        ppiG.add_edge(u, v, weight=w)

    if verbose:
        print("Total number of graph nodes:", ppiG.number_of_nodes())
        print("Total number of graph edges:", ppiG.number_of_edges())
        degrees = [degree for _, degree in ppiG.degree()]
        # Guard against an empty edge table to avoid a division by zero.
        average_degree = round(sum(degrees) / len(degrees), 2) if degrees else 0.0
        print("Average node degree:", average_degree)
        print(f"There are {len(list(nx.isolates(ppiG)))} isolated genes")

    # NOTE(review): ``fit`` receives the graph only; verify that this is enough
    # for 'AE', which may also expect a node-feature matrix.
    embedder.fit(ppiG)
    embedding = embedder.get_embedding()

    # Row i of the embedding is labelled with gene i (``genes[i]``).
    df_emb = pd.DataFrame(
        embedding,
        columns=[f'{method}_{i}' for i in range(embedding.shape[1])],
        index=genes,
    )
    return df_emb