# vim: set fileencoding=utf-8 :
# Copyright 2018-2023 John Lees and Nick Croucher
'''Network functions'''
# universal
import os
import sys
# additional
import operator
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from collections import defaultdict, Counter
from functools import partial
from multiprocessing import Pool
import pickle
import graph_tool.all as gt
# Load GPU libraries
try:
import cupyx
import cugraph
import cudf
import cupy as cp
from numba import cuda
import rmm
except ImportError:
pass
import poppunk_refine
from .__main__ import accepted_weights_types
from .__main__ import betweenness_sample_default
from .sketchlib import addRandom
from .utils import iterDistRows
from .utils import readIsolateTypeFromCsv
from .utils import check_and_set_gpu
from .unwords import gen_unword
def fetchNetwork(network_dir, model, refList, ref_graph = False,
core_only = False, accessory_only = False, use_gpu = False):
"""Load the network based on input options
Returns the network as a graph-tool format graph, and sets
the slope parameter of the passed model object.
Args:
network_dir (str)
A network used to define clusters
model (ClusterFit)
A fitted model object
refList (list)
Names of references that should be in the network
ref_graph (bool)
Use ref only graph, if available
[default = False]
core_only (bool)
Return the network created using only core distances
[default = False]
accessory_only (bool)
Return the network created using only accessory distances
[default = False]
use_gpu (bool)
Use cugraph library to load graph
Returns:
genomeNetwork (graph)
The loaded network
cluster_file (str)
The CSV of cluster assignments corresponding to this network
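    Example:
        A minimal sketch of a typical call; the directory name is
        illustrative and ``model`` is assumed to be a fitted ClusterFit::

            G, cluster_csv = fetchNetwork("db_output", model, rlist,
                                          ref_graph = True)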
"""
# If a refined fit, may use just core or accessory distances
dir_prefix = network_dir + "/" + os.path.basename(network_dir)
if use_gpu:
graph_suffix = '.csv.gz'
else:
graph_suffix = '.gt'
if core_only and model.type == 'refine':
if ref_graph:
network_file = dir_prefix + '_core.refs_graph' + graph_suffix
else:
network_file = dir_prefix + '_core_graph' + graph_suffix
cluster_file = dir_prefix + '_core_clusters.csv'
elif accessory_only and model.type == 'refine':
if ref_graph:
network_file = dir_prefix + '_accessory.refs_graph' + graph_suffix
else:
network_file = dir_prefix + '_accessory_graph' + graph_suffix
cluster_file = dir_prefix + '_accessory_clusters.csv'
else:
if ref_graph and os.path.isfile(dir_prefix + '.refs_graph' + graph_suffix):
network_file = dir_prefix + '.refs_graph' + graph_suffix
else:
network_file = dir_prefix + '_graph' + graph_suffix
cluster_file = dir_prefix + '_clusters.csv'
if core_only or accessory_only:
sys.stderr.write("Can only do --core or --accessory fits from "
"a refined fit. Using the combined distances.\n")
# Load network file
sys.stderr.write("Loading network from " + network_file + "\n")
genomeNetwork = load_network_file(network_file, use_gpu = use_gpu)
# Ensure all in dists are in final network
checkNetworkVertexCount(refList, genomeNetwork, use_gpu)
return genomeNetwork, cluster_file
def load_network_file(fn, use_gpu = False):
"""Load the network based on input options
Returns the network as a graph-tool format graph, and sets
the slope parameter of the passed model object.
Args:
fn (str)
Network file name
use_gpu (bool)
Use cugraph library to load graph
Returns:
genomeNetwork (graph)
The loaded network
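    Example:
        A minimal sketch; the file name shown is illustrative::

            G = load_network_file("db_output/db_output_graph.gt")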
"""
# Load the network from the specified file
if use_gpu:
G_df = cudf.read_csv(fn, compression = 'gzip')
if 'src' in G_df.columns:
G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True)
genomeNetwork = cugraph.Graph()
if 'weights' in G_df.columns:
G_df = G_df[['source','destination','weights']]
genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
else:
genomeNetwork.from_cudf_edgelist(G_df,renumber=False)
sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n")
else:
genomeNetwork = gt.load_graph(fn)
sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n")
return genomeNetwork
def checkNetworkVertexCount(seq_list, G, use_gpu):
"""Checks the number of network vertices matches the number
of sequence names.
Args:
seq_list (list)
The list of sequence names
G (graph)
The network of sequences
use_gpu (bool)
Whether to use cugraph for graph analyses
"""
vertex_list = set(get_vertex_list(G, use_gpu = use_gpu))
    networkMissing = set(range(len(seq_list))).difference(vertex_list)
if len(networkMissing) > 0:
sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n")
        if len(networkMissing) == 1:
            missing_isolate_index = networkMissing.pop()
            sys.stderr.write('Missing isolate is: ' + seq_list[missing_isolate_index] + ' (index ' + str(missing_isolate_index) + ')\n')
        elif len(networkMissing) < 10:
            sys.stderr.write('Missing isolates are: ' + ' '.join([seq_list[x] for x in networkMissing]) + '\n')
            sys.stderr.write('These have the indices ' + ' '.join([str(x) for x in networkMissing]) + '\n')
sys.exit(1)
def getCliqueRefs(G, reference_indices = set()):
"""Recursively prune a network of its cliques. Returns one vertex from
a clique at each stage
Args:
G (graph)
The graph to get clique representatives from
reference_indices (set)
The unique list of vertices being kept, to add to
"""
cliques = gt.max_cliques(G)
try:
# Get the first clique, and see if it has any members already
# contained in the vertex list
clique = frozenset(next(cliques))
if clique.isdisjoint(reference_indices):
reference_indices.add(list(clique)[0])
# Remove the clique, and prune the resulting subgraph (recursively)
subgraph = gt.GraphView(G, vfilt=[v not in clique for v in G.vertices()])
if subgraph.num_vertices() > 1:
getCliqueRefs(subgraph, reference_indices)
elif subgraph.num_vertices() == 1:
reference_indices.add(subgraph.get_vertices()[0])
except StopIteration:
pass
return reference_indices
def cliquePrune(component, graph, reference_indices, components_list):
"""Wrapper function around :func:`~getCliqueRefs` so it can be
called by a multiprocessing pool
"""
if gt.openmp_enabled():
gt.openmp_set_num_threads(1)
sys.setrecursionlimit(3000)
subgraph = gt.GraphView(graph, vfilt=components_list == component)
refs = reference_indices.copy()
if subgraph.num_vertices() <= 2:
refs.add(subgraph.get_vertices()[0])
ref_list = refs
else:
ref_list = getCliqueRefs(subgraph, refs)
return(list(ref_list))
def translate_network_indices(G_ref_df, reference_indices):
"""Function for ensuring an updated reference network retains
numbering consistent with sample names
Args:
G_ref_df (cudf data frame)
List of edges in reference network
reference_indices (list)
The ordered list of reference indices in the original network
Returns:
G_ref (cugraph network)
Network of reference sequences
"""
# Translate network indices to match name order
G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()]
G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()]
G_ref = generate_cugraph(G_ref_df, len(reference_indices) - 1, renumber = True)
return(G_ref)
def writeReferences(refList, outPrefix, outSuffix = ""):
"""Writes chosen references to file
Args:
refList (list)
Reference names to write
outPrefix (str)
Prefix for output file
outSuffix (str)
Suffix for output file (.refs will be appended)
Returns:
refFileName (str)
The name of the file references were written to
"""
# write references to file
refFileName = outPrefix + "/" + os.path.basename(outPrefix) + outSuffix + ".refs"
with open(refFileName, 'w') as rFile:
for ref in refList:
rFile.write(ref + '\n')
return refFileName
def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False,
old_ids = None, previous_pkl = None, weights = False,
use_gpu = False):
"""Load previous network, extract the edges to match the
vertex order specified in rlist, and also return weights if specified.
Args:
prev_G_fn (str or graph object)
Path of file containing existing network, or already-loaded
graph object
        rlist (list)
            List of reference sequence labels in new network
        adding_qq_dists (bool)
            Boolean specifying whether query-query edges are being added
            to an existing network, such that not all the sequence IDs will
            be found in the old IDs, which should already be correctly ordered
old_ids (list)
List of IDs of vertices in existing network
previous_pkl (str)
Path of pkl file containing names of sequences in
previous network
weights (bool)
Whether to return edge weights
(default = False)
use_gpu (bool)
Whether to use cugraph for graph analyses
Returns:
source_ids (list)
Source nodes for each edge
target_ids (list)
Target nodes for each edge
edge_weights (list)
Weights for each new edge
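    Example:
        A minimal sketch, assuming an existing graph and a matching
        ``.pkl`` of sequence names (paths are illustrative)::

            sources, targets, weights = network_to_edges("old_db/old_db_graph.gt",
                                                         rlist,
                                                         previous_pkl = "old_db/old_db.dists.pkl",
                                                         weights = True)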
"""
# Load graph from file if passed string; else use graph object passed in
# as argument
if isinstance(prev_G_fn, str):
prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu)
else:
prev_G = prev_G_fn
# load list of names in previous network if pkl name supplied
if previous_pkl is not None:
with open(previous_pkl, 'rb') as pickle_file:
old_rlist, old_qlist, self = pickle.load(pickle_file)
if self:
old_ids = old_rlist
else:
old_ids = old_rlist + old_qlist
elif old_ids is None:
sys.stderr.write('Missing .pkl file containing names of sequences in '
'previous network\n')
sys.exit(1)
# Get edges as lists of source,destination,weight using original IDs
if use_gpu:
G_df = prev_G.view_edge_list()
if weights:
if len(G_df.columns) < 3:
sys.stderr.write('Loaded network does not have edge weights; try a different '
'network or turn off graph weights\n')
                sys.exit(1)
if 'src' in G_df.columns:
                G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True)
edge_weights = G_df['weights'].to_arrow().to_pylist()
G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True)
old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist()
old_target_ids = G_df['destination'].astype('int32').to_arrow().to_pylist()
else:
# get the source and target nodes
old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source")
old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target")
# get the weights
if weights:
            if 'weight' not in prev_G.edge_properties:
sys.stderr.write('Loaded network does not have edge weights; try a different '
'network or turn off graph weights\n')
                sys.exit(1)
edge_weights = list(prev_G.ep['weight'])
# If appending queries to an existing network, then the recovered links can be left
# unchanged, as the new IDs are the queries, and the existing sequences will not be found
# in the list of IDs
if adding_qq_dists:
source_ids = old_source_ids
target_ids = old_target_ids
else:
try:
# Update IDs to new versions
old_id_indices = [rlist.index(x) for x in old_ids]
# translate to indices
source_ids = [old_id_indices[x] for x in old_source_ids]
target_ids = [old_id_indices[x] for x in old_target_ids]
        except ValueError:
            sys.stderr.write("Network size mismatch. Previous network contains nodes absent "
                             "from the supplied sequence labels\n")
            sys.exit(1)
# return values
if weights:
return source_ids, target_ids, edge_weights
else:
return source_ids, target_ids
def print_network_summary(G, sample_size = None, betweenness_sample = betweenness_sample_default, use_gpu = False):
"""Wrapper function for printing network information
Args:
G (graph)
            The network to summarise
sample_size (int)
Number of nodes to subsample for graph statistic calculation
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
use_gpu (bool)
Whether to use GPUs for network construction
"""
# print some summaries
(metrics, scores) = networkSummary(G,
subsample = sample_size,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]),
"\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]),
"\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]),
"\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]),
"\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]),
"\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]),
"\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]),
"\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])])
+ "\n")
def process_weights(distMat, weights_type):
"""Calculate edge weights from the distance matrix
Args:
distMat (2 column ndarray)
Numpy array of pairwise distances
weights_type (str)
Measure to calculate from the distMat to use as edge weights in network
- options are core, accessory or euclidean distance
Returns:
processed_weights (list)
Edge weights
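    Example:
        A minimal sketch with a toy two-column (core, accessory) matrix::

            distMat = np.array([[0.1, 0.3], [0.2, 0.4]])
            process_weights(distMat, 'euclidean')  # row-wise Euclidean norms
            process_weights(distMat, 'core')       # first column only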
"""
processed_weights = []
if weights_type is not None and distMat is not None:
# Check weights type is valid
if weights_type not in accepted_weights_types:
sys.stderr.write("Unable to calculate distance type " + str(weights_type) + "; "
"accepted types are " + str(accepted_weights_types) + "\n")
if weights_type == 'euclidean':
processed_weights = np.linalg.norm(distMat, axis = 1).tolist()
elif weights_type == 'core':
processed_weights = distMat[:, 0].tolist()
elif weights_type == 'accessory':
processed_weights = distMat[:, 1].tolist()
else:
sys.stderr.write('Require distance matrix to calculate distances\n')
return processed_weights
def process_previous_network(previous_network = None, adding_qq_dists = False, old_ids = None,
previous_pkl = None, vertex_labels = None, weights = False, use_gpu = False):
"""Extract edge types from an existing network
Args:
previous_network (str or graph object)
Name of file containing a previous network to be integrated into this new
network, or already-loaded graph object
adding_qq_dists (bool)
Boolean specifying whether query-query edges are being added
to an existing network, such that not all the sequence IDs will
be found in the old IDs, which should already be correctly ordered
old_ids (list)
Ordered list of vertex names in previous network
previous_pkl (str)
Name of file containing the names of the sequences in the previous_network
ordered based on the original network construction
vertex_labels (list)
Ordered list of sequence labels
weights (bool)
Whether weights should be extracted from the previous network
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
extra_sources (list)
List of source node identifiers
extra_targets (list)
List of destination node identifiers
extra_weights (list or None)
List of edge weights
"""
if previous_pkl is not None or old_ids is not None:
if weights:
# Extract from network
extra_sources, extra_targets, extra_weights = network_to_edges(previous_network,
vertex_labels,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_pkl = previous_pkl,
weights = True,
use_gpu = use_gpu)
else:
# Extract from network
extra_sources, extra_targets = network_to_edges(previous_network,
vertex_labels,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_pkl = previous_pkl,
weights = False,
use_gpu = use_gpu)
extra_weights = None
else:
        sys.stderr.write('A distance pkl corresponding to the previous network is required for loading\n')
sys.exit(1)
return extra_sources, extra_targets, extra_weights
def construct_network_from_edge_list(rlist,
qlist,
edge_list,
weights = None,
distMat = None,
previous_network = None,
adding_qq_dists = False,
old_ids = None,
previous_pkl = None,
betweenness_sample = betweenness_sample_default,
summarise = True,
sample_size = None,
use_gpu = False):
"""Construct an undirected network using a list of edges as tuples. Nodes are samples and
    edges connect samples within the same cluster
Will print summary statistics about the network to ``STDERR``
Args:
rlist (list)
List of reference sequence labels
qlist (list)
List of query sequence labels
edge_list (list of tuples)
List of tuples describing the edges of the graph
weights (list)
List of edge weights
distMat (2 column ndarray)
Numpy array of pairwise distances
previous_network (str or graph object)
Name of file containing a previous network to be integrated into this new
network, or the already-loaded graph object
adding_qq_dists (bool)
Boolean specifying whether query-query edges are being added
to an existing network, such that not all the sequence IDs will
be found in the old IDs, which should already be correctly ordered
old_ids (list)
Ordered list of vertex names in previous network
previous_pkl (str)
Name of file containing the names of the sequences in the previous_network
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
summarise (bool)
Whether to calculate and print network summaries with :func:`~networkSummary`
(default = True)
sample_size (int)
Number of nodes to subsample for graph statistic calculation
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
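    Example:
        A minimal sketch building a small unweighted CPU network; the
        labels and edge tuples are illustrative::

            rlist = ["sample1", "sample2", "sample3"]
            G = construct_network_from_edge_list(rlist, rlist, [(0, 1), (1, 2)],
                                                 summarise = False)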
"""
# data structures
if rlist != qlist:
vertex_labels = rlist + qlist
else:
vertex_labels = rlist
# Create new network
if use_gpu:
# benchmarking concurs with https://stackoverflow.com/questions/55922162/recommended-cudf-dataframe-construction
if len(edge_list) > 1:
edge_array = cp.array(edge_list, dtype = np.int32)
edge_gpu_matrix = cuda.to_device(edge_array)
G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination'])
elif len(edge_list) == 1:
# Cannot generate an array when one edge
G_df = cudf.DataFrame(columns = ['source','destination'])
G_df['source'] = [edge_list[0][0]]
G_df['destination'] = [edge_list[0][1]]
else:
            sys.stderr.write('ERROR: Missing link in graph assignment\n')
            sys.exit(1)
if weights is not None:
G_df['weights'] = weights
G = construct_network_from_df(rlist, qlist, G_df,
weights = (weights is not None),
distMat = distMat,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_network = previous_network,
previous_pkl = previous_pkl,
summarise = False,
use_gpu = use_gpu)
else:
# Load previous network
if previous_network is not None:
extra_sources, extra_targets, extra_weights = \
process_previous_network(previous_network = previous_network,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_pkl = previous_pkl,
vertex_labels = vertex_labels,
weights = (weights is not None),
use_gpu = use_gpu)
# Construct list of tuples for graph-tool
# Include information from previous graph if supplied
if weights is not None:
weighted_edges = []
for ((src, dest), weight) in zip(edge_list, weights):
weighted_edges.append((src, dest, weight))
if previous_network is not None:
for (src, dest, weight) in zip(extra_sources, extra_targets, extra_weights):
weighted_edges.append((src, dest, weight))
edge_list = weighted_edges
else:
if previous_network is not None:
for (src, dest) in zip(extra_sources, extra_targets):
edge_list.append((src, dest))
# build the graph (from scratch)
#TODO append to existing graph
G = gt.Graph(directed = False)
G.add_vertex(len(vertex_labels))
if weights is not None:
eweight = G.new_ep("float")
G.add_edge_list(edge_list, eprops = [eweight])
G.edge_properties["weight"] = eweight
else:
G.add_edge_list(edge_list)
if summarise:
print_network_summary(G,
sample_size = sample_size,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
return G
def construct_network_from_df(rlist,
qlist,
G_df,
weights = False,
distMat = None,
previous_network = None,
adding_qq_dists = False,
old_ids = None,
previous_pkl = None,
betweenness_sample = betweenness_sample_default,
summarise = True,
sample_size = None,
use_gpu = False):
"""Construct an undirected network using a data frame of edges. Nodes are samples and
    edges connect samples within the same cluster
Will print summary statistics about the network to ``STDERR``
Args:
rlist (list)
List of reference sequence labels
qlist (list)
List of query sequence labels
G_df (cudf or pandas data frame)
Data frame in which the first two columns are the nodes linked by edges
weights (bool)
Whether weights in the G_df data frame should be included in the network
distMat (2 column ndarray)
Numpy array of pairwise distances
previous_network (str or graph object)
Name of file containing a previous network to be integrated into this new
network, or the already-loaded graph object
adding_qq_dists (bool)
Boolean specifying whether query-query edges are being added
to an existing network, such that not all the sequence IDs will
be found in the old IDs, which should already be correctly ordered
old_ids (list)
Ordered list of vertex names in previous network
previous_pkl (str)
Name of file containing the names of the sequences in the previous_network
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
summarise (bool)
Whether to calculate and print network summaries with :func:`~networkSummary`
(default = True)
sample_size (int)
Number of nodes to subsample for graph statistic calculation
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
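    Example:
        A minimal sketch using a pandas edge list on the CPU; the frame
        contents are illustrative::

            G_df = pd.DataFrame({'source': [0, 1], 'destination': [1, 2]})
            G = construct_network_from_df(rlist, rlist, G_df, summarise = False)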
"""
# data structures
if rlist != qlist:
vertex_labels = rlist + qlist
else:
vertex_labels = rlist
# Check df format is correct
if weights:
G_df.columns = ['source','destination','weights']
else:
G_df.columns = ['source','destination']
# Load previous network
if previous_network is not None:
extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_pkl = previous_pkl,
vertex_labels = vertex_labels,
weights = weights,
use_gpu = use_gpu)
if use_gpu:
G_extra_df = cudf.DataFrame()
else:
G_extra_df = pd.DataFrame()
G_extra_df['source'] = extra_sources
G_extra_df['destination'] = extra_targets
if extra_weights is not None:
G_extra_df['weights'] = extra_weights
        if use_gpu:
            G_df = cudf.concat([G_df,G_extra_df], ignore_index = True)
        else:
            G_df = pd.concat([G_df,G_extra_df], ignore_index = True)
if use_gpu:
# direct conversion
# ensure the highest-integer node is included in the edge list
# by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206
max_in_vertex_labels = len(vertex_labels)-1
use_weights = False
if weights:
use_weights = True
G = generate_cugraph(G_df, max_in_vertex_labels, weights = use_weights, renumber = False)
else:
# Convert bool to list of weights or None
if weights:
weights = G_df['weights']
else:
weights = None
# Convert data frame to list of tuples
connections = list(zip(*[G_df[c].values.tolist() for c in G_df[['source','destination']]]))
G = construct_network_from_edge_list(rlist, qlist, connections,
weights = weights,
distMat = distMat,
previous_network = previous_network,
old_ids = old_ids,
previous_pkl = previous_pkl,
summarise = False,
use_gpu = use_gpu)
if summarise:
print_network_summary(G,
sample_size = sample_size,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
return G
def construct_network_from_sparse_matrix(rlist,
qlist,
sparse_input,
weights = None,
previous_network = None,
previous_pkl = None,
betweenness_sample = betweenness_sample_default,
summarise = True,
sample_size = None,
use_gpu = False):
"""Construct an undirected network using a sparse matrix. Nodes are samples and
    edges connect samples within the same cluster
Will print summary statistics about the network to ``STDERR``
Args:
rlist (list)
List of reference sequence labels
qlist (list)
List of query sequence labels
sparse_input (numpy.array)
Sparse distance matrix from lineage fit
weights (list)
List of weights for each edge in the network
previous_network (str)
Name of file containing a previous network to be integrated into this new
network
previous_pkl (str)
Name of file containing the names of the sequences in the previous_network
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
summarise (bool)
Whether to calculate and print network summaries with :func:`~networkSummary`
(default = True)
sample_size (int)
Number of nodes to subsample for graph statistic calculation
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
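    Example:
        A minimal sketch from a COO-format matrix, as produced by a
        lineage fit (the matrix here is illustrative)::

            from scipy.sparse import coo_matrix
            sparse_input = coo_matrix(([0.1, 0.2], ([0, 1], [1, 2])),
                                      shape = (3, 3))
            G = construct_network_from_sparse_matrix(rlist, rlist, sparse_input,
                                                     summarise = False)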
"""
if use_gpu:
G_df = cudf.DataFrame()
else:
G_df = pd.DataFrame()
G_df['source'] = sparse_input.row
G_df['destination'] = sparse_input.col
G_df['weights'] = sparse_input.data
G = construct_network_from_df(rlist, qlist, G_df,
weights = True,
previous_network = previous_network,
previous_pkl = previous_pkl,
betweenness_sample = betweenness_sample,
summarise = False,
use_gpu = use_gpu)
if summarise:
print_network_summary(G,
sample_size = sample_size,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
return G
def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gpu = False):
"""Construct an undirected network using sequence lists, assignments of pairwise distances
to clusters, and the identifier of the cluster assigned to within-strain distances.
Nodes are samples and edges where samples are within the same cluster
Will print summary statistics about the network to ``STDERR``
Args:
rlist (list)
List of reference sequence labels
distMat (2 column ndarray)
Numpy array of pairwise distances
weights_type (str)
Type of weight to use for network
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
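    Example:
        A minimal sketch for a small all-vs-all weighted network, assuming
        ``distMat`` has one row per pair in ``rlist`` order::

            G = construct_dense_weighted_network(rlist, distMat,
                                                 weights_type = 'core')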
"""
# data structures
vertex_labels = rlist
    # Check a weight type has been specified
    if weights_type is None:
        sys.stderr.write("Need weights to construct weighted network\n")
        sys.exit(1)
    # Process weights
    weights = process_weights(distMat, weights_type)
# Convert edge indices to tuples
edge_list = poppunk_refine.generateAllTuples(num_ref = len(rlist),
self = True,
int_offset = 0)
    if use_gpu:
        # Construct network with GPU via data frame
        if len(edge_list) > 1:
            edge_array = cp.array(edge_list, dtype = np.int32)
            edge_gpu_matrix = cuda.to_device(edge_array)
            G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination'])
        else:
            # Cannot generate an array when there is only one edge
            G_df = cudf.DataFrame(columns = ['source','destination'])
            G_df['source'] = [edge_list[0][0]]
            G_df['destination'] = [edge_list[0][1]]
        G_df['weights'] = weights
max_in_vertex_labels = len(vertex_labels)-1
G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False)
else:
# Construct network with CPU via edge list
weighted_edges = []
for ((src, dest), weight) in zip(edge_list, weights):
weighted_edges.append((src, dest, weight))
# build the graph
G = gt.Graph(directed = False)
G.add_vertex(len(vertex_labels))
eweight = G.new_ep("float")
# Could alternatively assign weights through eweight.a = weights
G.add_edge_list(weighted_edges, eprops = [eweight])
G.edge_properties["weight"] = eweight
return G
def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0,
weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None,
adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default,
summarise = True, sample_size = None, use_gpu = False):
"""Construct an undirected network using sequence lists, assignments of pairwise distances
to clusters, and the identifier of the cluster assigned to within-strain distances.
    Nodes are samples and edges connect samples within the same cluster
Will print summary statistics about the network to ``STDERR``
Args:
rlist (list)
List of reference sequence labels
qlist (list)
List of query sequence labels
assignments (numpy.array or int)
Labels of most likely cluster assignment
within_label (int)
The label for the cluster representing within-strain distances
int_offset (int)
Constant integer to add to each node index
weights (list)
List of weights for each edge in the network
distMat (2 column ndarray)
Numpy array of pairwise distances
weights_type (str)
Measure to calculate from the distMat to use as edge weights in network
- options are core, accessory or euclidean distance
previous_network (str)
Name of file containing a previous network to be integrated into this new
network
old_ids (list)
Ordered list of vertex names in previous network
adding_qq_dists (bool)
Boolean specifying whether query-query edges are being added
to an existing network, such that not all the sequence IDs will
be found in the old IDs, which should already be correctly ordered
previous_pkl (str)
Name of file containing the names of the sequences in the previous_network
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
summarise (bool)
Whether to calculate and print network summaries with :func:`~networkSummary`
(default = True)
sample_size (int)
Number of nodes to subsample for graph statistic calculation
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
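    Example:
        A minimal sketch; ``assignments`` holds one label per pairwise
        comparison, and only pairs matching ``within_label`` become edges
        (values here are illustrative)::

            assignments = np.array([1, 0, 1])
            G = construct_network_from_assignments(rlist, rlist, assignments,
                                                   within_label = 1,
                                                   summarise = False)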
"""
# Filter weights to only the relevant edges
if weights is not None:
weights = weights[assignments == within_label]
elif distMat is not None and weights_type is not None:
if isinstance(assignments, list):
assignments = np.array(assignments)
distMat = distMat[assignments == within_label,:]
weights = process_weights(distMat, weights_type)
# Convert edge indices to tuples
connections = poppunk_refine.generateTuples(assignments,
within_label,
self = (rlist == qlist),
num_ref = len(rlist),
int_offset = int_offset)
# Construct network using edge list
G = construct_network_from_edge_list(rlist, qlist, connections,
weights = weights,
distMat = distMat,
previous_network = previous_network,
adding_qq_dists = adding_qq_dists,
old_ids = old_ids,
previous_pkl = previous_pkl,
summarise = False,
use_gpu = use_gpu)
if summarise:
print_network_summary(G,
sample_size = sample_size,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
return G
def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sample_default,
subsample = None, use_gpu = False):
"""Provides summary values about the network
Args:
G (graph)
The network of strains
calc_betweenness (bool)
Whether to calculate betweenness stats
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
subsample (int)
Number of vertices to randomly subsample from graph
use_gpu (bool)
Whether to use cugraph for graph analysis
Returns:
metrics (list)
List with # components, density, transitivity, mean betweenness
and weighted mean betweenness
scores (list)
List of scores
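    Example:
        A minimal sketch summarising an in-memory graph-tool network::

            (metrics, scores) = networkSummary(G, betweenness_sample = 100)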
"""
if use_gpu:
if subsample is None:
S = G
else:
            vertex_subsample = cp.random.choice(cp.arange(0, G.number_of_vertices()),
                                                size = subsample,
                                                replace = False)
S = cugraph.subgraph(G, vertex_subsample)
component_assignments = cugraph.components.connectivity.connected_components(S)
component_nums = component_assignments['labels'].unique().astype(int)
components = len(component_nums)
        density = S.number_of_edges()/(0.5 * S.number_of_vertices() * (S.number_of_vertices() - 1))
# need to check consistent with graph-tool
triangle_counts = cugraph.triangle_count(S)
triangle_count = triangle_counts['counts'].sum()/3
degree_df = S.in_degree()
# consistent with graph-tool
triad_count = 0.5 * sum([d * (d - 1) for d in degree_df[degree_df['degree'] > 1]['degree'].to_pandas()])
if triad_count > 0:
transitivity = triangle_count/triad_count
else:
transitivity = 0.0
else:
if subsample is None:
S = G
else:
            vertex_subsample = np.random.choice(np.arange(0, G.num_vertices()),
                                                size = subsample,
                                                replace = False)
            vfilt_bool = np.full(G.num_vertices(), False)
            vfilt_bool[vertex_subsample] = True
            vfilt = G.new_vertex_property('bool', vals = vfilt_bool)
S = gt.GraphView(G, vfilt=vfilt)
component_assignments, component_frequencies = gt.label_components(S)
components = len(component_frequencies)
density = len(list(S.edges()))/(0.5 * len(list(S.vertices())) * (len(list(S.vertices())) - 1))
transitivity = gt.global_clustering(S)[0]
mean_bt = 0
weighted_mean_bt = 0
if calc_betweenness:
betweenness = []
sizes = []
if use_gpu:
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
for component in component_nums.to_pandas():
size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int)
if size > 3:
component_vertices = component_assignments['vertex'][component_assignments['labels']==component]
subgraph = cugraph.subgraph(S, component_vertices)
if len(component_vertices) >= betweenness_sample:
component_betweenness = cugraph.betweenness_centrality(subgraph,
k = betweenness_sample,
normalized = True)
else:
component_betweenness = cugraph.betweenness_centrality(subgraph,
normalized = True)
betweenness.append(component_betweenness['betweenness_centrality'].max())
sizes.append(size)
else:
for component, size in enumerate(component_frequencies):
if size > 3:
vfilt = component_assignments.a == component
subgraph = gt.GraphView(S, vfilt=vfilt)
betweenness.append(max(vertex_betweenness(subgraph, norm=True)))
sizes.append(size)
if len(betweenness) > 1:
mean_bt = np.mean(betweenness)
weighted_mean_bt = np.average(betweenness, weights=sizes)
elif len(betweenness) == 1:
mean_bt = betweenness[0]
weighted_mean_bt = betweenness[0]
# Calculate scores
metrics = [components, density, transitivity, mean_bt, weighted_mean_bt]
base_score = transitivity * (1 - density)
scores = [base_score, base_score * (1 - metrics[3]), base_score * (1 - metrics[4])]
return(metrics, scores)
# graph-tool only, for now
def vertex_betweenness(graph, norm=True):
"""Returns betweenness for nodes in the graph
"""
return gt.betweenness(graph, norm=norm)[0].a
def addQueryToNetwork(dbFuncs, rList, qList, G,
assignments, model, queryDB, kmers = None, distance_type = 'euclidean',
queryQuery = False, strand_preserved = False, weights = None, threads = 1,
use_gpu = False):
"""Finds edges between queries and items in the reference database,
and modifies the network to include them.
Args:
dbFuncs (list)
List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs`
rList (list)
List of reference names
qList (list)
List of query names
G (graph)
Network to add to (mutated)
assignments (numpy.array)
Cluster assignment of items in qlist
model (ClusterModel)
Model fitted to reference database
queryDB (str)
Query database location
kmers (list)
List of k-mer sizes
distance_type (str)
Distance type to use as weights in network
queryQuery (bool)
Add in all query-query distances
(default = False)
strand_preserved (bool)
Whether to treat strand as known (i.e. ignore rc k-mers)
when adding random distances. Only used if queryQuery = True
[default = False]
weights (numpy.array)
If passed, the core,accessory distances for each assignment, which will
be annotated as an edge attribute
        threads (int)
            Number of threads to use if new db created
            (default = 1)
        use_gpu (bool)
            Whether to use cugraph for analysis
            (default = False)
    Returns:
        G (graph)
            Network with queries added
        qqDistMat (numpy.array)
            Query-query distances
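    Example:
        A minimal sketch; ``dbFuncs`` comes from
        :func:`~PopPUNK.utils.setupDBFuncs` and the other inputs are assumed
        to be consistent with the reference database::

            G, qqDistMat = addQueryToNetwork(dbFuncs, rList, qList, G,
                                             assignments, model, "query_db",
                                             kmers = kmers)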
"""
    # initialise functions
queryDatabase = dbFuncs['queryDatabase']
if len(qList) > 1 and kmers is None:
raise RuntimeError("Must provide db querying info (kmers) if adding "
"more than one sample, as q-q dists may be needed")
# do not calculate weights unless specified
if weights is None:
weights_type = None
else:
weights_type = distance_type
# These are returned
qqDistMat = None
# store links for each query in a list of edge tuples
ref_count = len(rList)
# Add queries to network
G = construct_network_from_assignments(rList,
qList,
assignments,
within_label = model.within_label,
previous_network = G,
old_ids = rList,
distMat = weights,
weights_type = weights_type,
summarise = False,
use_gpu = use_gpu)
# Check if any queries were not assigned, run qq dists if so
if not queryQuery:
if use_gpu:
edge_count = G.degree(list(range(ref_count, ref_count + len(qList))))
            new_query_clusters = edge_count['degree'].isin([0]).any()
else:
edge_count = G.get_total_degrees(list(range(ref_count, ref_count + len(qList))))
new_query_clusters = np.any(edge_count == 0)
if new_query_clusters:
sys.stderr.write("Found novel query clusters. Calculating distances between them.\n")
queryQuery = True
# Calculate all query-query distances too, if updating database
if queryQuery:
if len(qList) == 1:
qqDistMat = np.zeros((0, 2), dtype=np.float32)
else:
sys.stderr.write("Calculating all query-query distances\n")
addRandom(queryDB, qList, kmers, strand_preserved, threads = threads)
qqDistMat = queryDatabase(rNames = qList,
qNames = qList,
dbPrefix = queryDB,
queryPrefix = queryDB,
klist = kmers,
self = True,
number_plot_fits = 0,
threads = threads)
if distance_type == 'core':
queryAssignation = model.assign(qqDistMat, slope = 0)
elif distance_type == 'accessory':
queryAssignation = model.assign(qqDistMat, slope = 1)
else:
queryAssignation = model.assign(qqDistMat)
# Add queries to network
vertex_labels = rList + qList
G = construct_network_from_assignments(vertex_labels,
vertex_labels,
queryAssignation,
int_offset = ref_count,
within_label = model.within_label,
previous_network = G,
old_ids = vertex_labels,
adding_qq_dists = True,
distMat = qqDistMat,
weights_type = weights_type,
summarise = False,
use_gpu = use_gpu)
return G, qqDistMat
def generate_cugraph(G_df, max_index, weights = False, renumber = True):
"""Builds cugraph graph to ensure all nodes are included in
the graph, even if singletons.
Args:
G_df (cudf)
cudf data frame containing edge list
max_index (int)
The 0-indexed maximum of the node indices
        weights (bool)
            Whether the data frame contains a 'weights' column
        renumber (bool)
            Whether to renumber the vertices when added to the graph
    Returns:
        G_new (graph)
            The resulting cugraph network
"""
# use self-loop to ensure all nodes are present
node_indices = cudf.Series(range(max_index+1), dtype = cp.int32)
G_self_loop = cudf.DataFrame()
G_self_loop['source'] = node_indices
G_self_loop['destination'] = node_indices
G_self_loop['weights'] = 0.0
# Build cudf
# Weights are needed to extract subgraphs
# see https://github.com/rapidsai/cugraph/blob/77d833ad/python/cugraph/cugraph/community/subgraph_extraction.py
if not weights:
G_df['weights'] = 1.0
G_df = cudf.concat([G_self_loop,G_df], ignore_index = True)
# Construct graph
G_new = cugraph.Graph()
G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber)
return G_new
def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
externalClusterCSV=None, printRef=True, printCSV=True,
clustering_type='combined', write_unwords=True,
use_gpu = False):
"""Get cluster assignments
Also writes assignments to a CSV file
Args:
G (graph)
Network used to define clusters
rlist (list)
Names of samples
outPrefix (str)
Prefix for output CSV
Default = None
oldClusterFile (str)
CSV with previous cluster assignments.
Pass to ensure consistency in cluster assignment name.
Default = None
externalClusterCSV (str)
CSV with cluster assignments from any source. Will print a file
relating these to new cluster assignments
Default = None
printRef (bool)
If false, print only query sequences in the output
Default = True
printCSV (bool)
Print results to file
Default = True
clustering_type (str)
Type of clustering network, used for comparison with old clusters
Default = 'combined'
write_unwords (bool)
            Write clusters with a pronounceable name rather than numerical index
Default = True
use_gpu (bool)
Whether to use cugraph for network analysis
Returns:
clustering (dict)
Dictionary of cluster assignments (keys are sequence names)
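    Example:
        A minimal sketch writing clusters for a graph-tool network; the
        output prefix is illustrative::

            clustering = printClusters(G, rlist, outPrefix = "output/db")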
"""
if oldClusterFile == None and printRef == False:
raise RuntimeError("Trying to print query clusters with no query sequences")
if write_unwords and not printCSV:
write_unwords = False
# get a sorted list of component assignments
if use_gpu:
component_assignments = cugraph.components.connectivity.connected_components(G)
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
newClusters = [set() for _ in range(component_frequencies.size)]
for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment
component = component_assignments['labels'].iloc[isolate_index].item()
component_rank_bool = component_frequencies.index == component
component_rank = int(cp.argmax(component_rank_bool))
newClusters[component_rank].add(isolate_name)
else:
component_assignments, component_frequencies = gt.label_components(G)
component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int)
# use components to determine new clusters
newClusters = [set() for rank in range(len(component_frequency_ranks))]
for isolate_index, isolate_name in enumerate(rlist):
component = component_assignments.a[isolate_index]
component_rank = component_frequency_ranks[component]
newClusters[component_rank].add(isolate_name)
oldNames = set()
if oldClusterFile != None:
oldAllClusters = readIsolateTypeFromCsv(oldClusterFile, mode = 'external', return_dict = False)
oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]]
# parse all previously used clusters, including those that are merged
parsed_oldClusters = set([int(item) for sublist in (x.split('_') for x in oldClusters) for item in sublist])
new_id = max(parsed_oldClusters) + 1 # 1-indexed
while new_id in parsed_oldClusters:
new_id += 1 # in case clusters have been merged
# Samples in previous clustering
for prev_cluster in oldClusters.values():
for prev_sample in prev_cluster:
oldNames.add(prev_sample)
# Assign each cluster a name
clustering = {}
foundOldClusters = []
cluster_unword = {}
if write_unwords:
unword_generator = gen_unword()
for newClsIdx, newCluster in enumerate(newClusters):
needs_unword = False
# Ensure consistency with previous labelling
if oldClusterFile != None:
merge = False
cls_id = None
# Samples in this cluster that are not queries
ref_only = oldNames.intersection(newCluster)
# A cluster with no previous observations
if len(ref_only) == 0:
cls_id = str(new_id) # harmonise data types; string flexibility helpful
new_id += 1
needs_unword = True
else:
# Search through old cluster IDs to find a match
for oldClusterName, oldClusterMembers in oldClusters.items():
join = ref_only.intersection(oldClusterMembers)
if len(join) > 0:
# Check cluster is consistent with previous definitions
if oldClusterName in foundOldClusters:
sys.stderr.write("WARNING: Old cluster " + oldClusterName + " split"
" across multiple new clusters\n")
else:
foundOldClusters.append(oldClusterName)
# Query has merged clusters
if len(join) < len(ref_only):
merge = True
needs_unword = True
if cls_id == None:
cls_id = oldClusterName
else:
cls_id += "_" + oldClusterName
# Exact match -> same name as before
elif len(join) == len(ref_only):
assert merge == False # should not have already been part of a merge
cls_id = oldClusterName
break
# Report merges
if merge:
merged_ids = cls_id.split("_")
sys.stderr.write("Clusters " + ",".join(merged_ids) + " have merged into " + cls_id + "\n")
# Otherwise just number sequentially starting from 1
else:
cls_id = newClsIdx + 1
needs_unword = True
if write_unwords and needs_unword:
unword = next(unword_generator)
else:
unword = None
for cluster_member in newCluster:
clustering[cluster_member] = cls_id
if unword is not None:
cluster_unword[cluster_member] = unword
# print clustering to file
if printCSV:
outFileName = outPrefix + "_clusters.csv"
with open(outFileName, 'w') as cluster_file:
cluster_file.write("Taxon,Cluster\n")
if write_unwords:
unword_file = open(outPrefix + "_unword_clusters.csv", 'w')
unword_file.write("Taxon,Cluster_name\n")
# sort the clusters by frequency - define a list with a custom sort order
# first line gives tuples e.g. (1, 28), (2, 17) - cluster 1 has 28 members, cluster 2 has 17 members
# second line takes first element - the cluster IDs sorted by frequency
freq_order = sorted(dict(Counter(clustering.values())).items(), key=operator.itemgetter(1), reverse=True)
freq_order = [x[0] for x in freq_order]
# iterate through cluster dictionary sorting by value using above custom sort order
for cluster_member, cluster_name in sorted(clustering.items(), key=lambda i:freq_order.index(i[1])):
if printRef or cluster_member not in oldNames:
cluster_file.write(",".join((cluster_member, str(cluster_name))) + "\n")
if write_unwords and cluster_member in cluster_unword:
unword_file.write(",".join((cluster_member, cluster_unword[cluster_member])) + "\n")
if write_unwords:
unword_file.close()
if externalClusterCSV is not None:
printExternalClusters(newClusters, externalClusterCSV, outPrefix, oldNames, printRef)
return(clustering)
def printExternalClusters(newClusters, extClusterFile, outPrefix,
oldNames, printRef = True):
"""Prints cluster assignments with respect to previously defined
clusters or labels.
Args:
newClusters (set iterable)
The components from the graph G, defining the PopPUNK clusters
extClusterFile (str)
A CSV file containing definitions of the external clusters for
each sample (does not need to contain all samples)
outPrefix (str)
Prefix for output CSV (_external_clusters.csv)
oldNames (list)
            Names of samples present in the previous clustering
printRef (bool)
If false, print only query sequences in the output
Default = True
"""
# Object to store output csv datatable
d = defaultdict(list)
# Read in external clusters
extClusters = \
readIsolateTypeFromCsv(extClusterFile,
mode = 'external',
return_dict = True)
# Go through each cluster (as defined by poppunk) and find the external
# clusters that had previously been assigned to any sample in the cluster
for ppCluster in newClusters:
# Store clusters as a set to avoid duplicates
prevClusters = defaultdict(set)
for sample in ppCluster:
for extCluster in extClusters:
if sample in extClusters[extCluster]:
prevClusters[extCluster].add(extClusters[extCluster][sample])
# Go back through and print the samples that were found
for sample in ppCluster:
if printRef or sample not in oldNames:
d['sample'].append(sample)
for extCluster in extClusters:
if extCluster in prevClusters:
d[extCluster].append(";".join(prevClusters[extCluster]))
else:
d[extCluster].append("NA")
if "sample" not in d:
sys.stderr.write("WARNING: No new samples found, cannot write external clusters\n")
else:
pd.DataFrame(data=d).to_csv(outPrefix + "_external_clusters.csv",
columns = ["sample"] + list(extClusters.keys()),
index = False)
def generate_minimum_spanning_tree(G, from_cugraph = False):
"""Generate a minimum spanning tree from a network
Args:
G (network)
Graph tool network
from_cugraph (bool)
            Whether G is a pre-calculated MST from cugraph
[default = False]
Returns:
        mst_network (graph)
            Minimum spanning tree (as graph-tool or cugraph graph)
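    Example:
        A minimal sketch; ``G`` must carry a ``weight`` edge property::

            mst = generate_minimum_spanning_tree(G)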
"""
#
# Create MST
#
if from_cugraph:
mst_network = G
else:
sys.stderr.write("Starting calculation of minimum-spanning tree\n")
# Test if weighted network and calculate minimum spanning tree
if "weight" in G.edge_properties:
mst_edge_prop_map = gt.min_spanning_tree(G, weights = G.ep["weight"])
mst_network = gt.GraphView(G, efilt = mst_edge_prop_map)
mst_network = gt.Graph(mst_network, prune = True)
else:
sys.stderr.write("generate_minimum_spanning_tree requires a weighted graph\n")
raise RuntimeError("MST passed unweighted graph")
# Find seed nodes as those with greatest outdegree in each component
num_components = 1
seed_vertices = set()
if from_cugraph:
mst_df = cugraph.components.connectivity.connected_components(mst_network)
        num_components = mst_df['labels'].nunique()
if num_components > 1:
mst_df['degree'] = mst_network.in_degree()['degree']
# idxmax only returns first occurrence of maximum so should maintain
# MST - check cuDF implementation is the same
max_indices = mst_df.groupby(['labels'])['degree'].idxmax()
seed_vertices = mst_df.iloc[max_indices]['vertex']
else:
component_assignments, component_frequencies = gt.label_components(mst_network)
num_components = len(component_frequencies)
if num_components > 1:
for component_index in range(len(component_frequencies)):
component_members = component_assignments.a == component_index
component = gt.GraphView(mst_network, vfilt = component_members)
component_vertices = component.get_vertices()
out_degrees = component.get_out_degrees(component_vertices)
seed_vertex = list(component_vertices[np.where(out_degrees == np.amax(out_degrees))])
seed_vertices.add(seed_vertex[0]) # Can only add one otherwise not MST
# If multiple components, add distances between seed nodes
if num_components > 1:
# Extract edges and maximum edge length - as DF for cugraph
# list of tuples for graph-tool
if from_cugraph:
# With cugraph the MST is already calculated
# so no extra edges can be retrieved from the graph
G_df = G.view_edge_list()
max_weight = G_df['weights'].max()
            first_seed = seed_vertices.iloc[0]
            G_seed_link_df = cudf.DataFrame()
            G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size]
            G_seed_link_df['src'] = first_seed
            G_seed_link_df['weights'] = max_weight
G_df = cudf.concat([G_df,G_seed_link_df])
else:
# With graph-tool look to retrieve edges in larger graph
connections = []
max_weight = float(np.max(G.edge_properties["weight"].a))
# Identify edges between seeds to link components together
for ref in seed_vertices:
seed_edges = G.get_all_edges(ref, [G.ep['weight']])
found = False # Not all edges may be in graph
for seed_edge in seed_edges:
if seed_edge[1] in seed_vertices:
found = True
connections.append((seed_edge))
# TODO: alternative would be to requery the DB (likely quick)
if found == False:
for query in seed_vertices:
if query != ref:
connections.append((ref, query, max_weight))
# Construct graph
if from_cugraph:
mst_network = cugraph.Graph()
G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True)
mst_network.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
else:
seed_G = gt.Graph(directed = False)
        seed_G.add_vertex(len(seed_vertices))
eweight = seed_G.new_ep("float")
seed_G.add_edge_list(connections, eprops = [eweight])
seed_G.edge_properties["weight"] = eweight
seed_mst_edge_prop_map = gt.min_spanning_tree(seed_G, weights = seed_G.ep["weight"])
seed_mst_network = gt.GraphView(seed_G, efilt = seed_mst_edge_prop_map)
# Insert seed MST into original MST - may be possible to use graph_union with include=True & intersection
deep_edges = seed_mst_network.get_edges([seed_mst_network.ep["weight"]])
mst_network.add_edge_list(deep_edges)
sys.stderr.write("Completed calculation of minimum-spanning tree\n")
return mst_network
def get_vertex_list(G, use_gpu = False):
"""Generate a list of node indices
Args:
G (network)
Graph tool network
use_gpu (bool)
Whether graph is a cugraph or not
[default = False]
Returns:
vlist (list)
List of integers corresponding to nodes
"""
if use_gpu:
        vlist = list(range(G.number_of_vertices()))
else:
vlist = list(G.vertices())
return vlist
def save_network(G, prefix = None, suffix = None, use_graphml = False,
use_gpu = False):
"""Save a network to disk
Args:
G (network)
Graph tool network
prefix (str)
            Prefix for output file
        suffix (str)
            Suffix for output file
use_graphml (bool)
Whether to output a graph-tool file
in graphml format
use_gpu (bool)
Whether graph is a cugraph or not
[default = False]
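    Example:
        A minimal sketch; for a graph-tool network this writes
        ``output/output_graph.gt`` (prefix illustrative)::

            save_network(G, prefix = "output", suffix = "_graph")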
"""
file_name = prefix + "/" + os.path.basename(prefix)
if suffix is not None:
file_name = file_name + suffix
if use_gpu:
G.to_pandas_edgelist().to_csv(file_name + '.csv.gz',
compression='gzip', index = False)
else:
if use_graphml:
G.save(file_name + '.graphml',
fmt = 'graphml')
else:
G.save(file_name + '.gt',
fmt = 'gt')
def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False):
"""Generate a network from a lineage rank fit
Args:
sparse_mat (scipy or cupyx sparse matrix)
Sparse matrix of kNN from lineage fit
rlist (list)
List of sequence names
use_gpu (bool)
Whether GPU libraries should be used
Returns:
G (network)
Graph tool or cugraph network
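    Example:
        A minimal sketch reusing a kNN sparse matrix from a lineage fit::

            G = sparse_mat_to_network(sparse_mat, rlist)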
"""
if use_gpu:
G_df = cudf.DataFrame(columns = ['source','destination','weights'])
G_df['source'] = sparse_mat.row
G_df['destination'] = sparse_mat.col
G_df['weights'] = sparse_mat.data
max_in_vertex_labels = len(rlist)-1
G = generate_cugraph(G_df, max_in_vertex_labels, weights = True, renumber = False)
else:
connections = []
        for (src, dst) in zip(sparse_mat.row, sparse_mat.col):
            connections.append((src, dst))
G = construct_network_from_edge_list(rlist,
rlist,
connections,
weights=sparse_mat.data,
summarise=False)
return G
def prune_graph(prefix, reflist, samples_to_keep, output_db_name, threads, use_gpu):
"""Keep only the specified sequences in a graph
Args:
prefix (str)
Name of directory containing network
reflist (list)
Ordered list of sequences of database
samples_to_keep (list)
The names of samples to be retained in the graph
output_db_name (str)
Name of output directory
threads (int)
Number of CPU threads to use when recalculating random match chances
[default = 1].
use_gpu (bool)
Whether graph is a cugraph or not
[default = False]
"""
if use_gpu:
graph_suffix = '.csv.gz'
else:
graph_suffix = '.gt'
network_found = False
for graph_name in ['_core.refs_graph','_core_graph','_accessory.refs_graph','_accessory_graph','.refs_graph','_graph']:
network_fn = f"{prefix}/{os.path.basename(prefix)}" + graph_name + graph_suffix
if os.path.exists(network_fn):
network_found = True
sys.stderr.write("Loading network from " + network_fn + "\n")
G = load_network_file(network_fn, use_gpu = use_gpu)
G_new = remove_nodes_from_graph(G, reflist, samples_to_keep, use_gpu)
save_network(G_new,
prefix = output_db_name,
suffix = '_graph',
use_graphml = False,
use_gpu = use_gpu)
if not network_found:
sys.stderr.write('No network file found for pruning\n')
def remove_nodes_from_graph(G, reflist, samples_to_keep, use_gpu):
"""Return a modified graph containing only the requested nodes
    Args:
        G (graph)
            The network to be pruned (graph-tool or cugraph)
reflist (list)
Ordered list of sequences of database
samples_to_keep (list)
The names of samples to be retained in the graph
use_gpu (bool)
Whether graph is a cugraph or not
[default = False]
Returns:
G_new (graph)
Pruned graph
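    Example:
        A minimal sketch keeping a named subset of samples (names
        illustrative)::

            G_sub = remove_nodes_from_graph(G, rlist, ["sample1", "sample3"],
                                            use_gpu = False)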
"""
samples_to_keep_set = frozenset(samples_to_keep)
if use_gpu:
# Identify indices
reference_indices = [i for (i,name) in enumerate(reflist) if name in samples_to_keep_set]
# Generate data frame
G_df = G.view_edge_list()
if 'src' in G_df.columns:
G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True)
# Filter data frame
G_new_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)]
# Translate network indices to match name order
G_new = translate_network_indices(G_new_df, reference_indices)
else:
reference_vertex = G.new_vertex_property('bool')
for n, vertex in enumerate(G.vertices()):
if reflist[n] in samples_to_keep_set:
reference_vertex[vertex] = True
else:
reference_vertex[vertex] = False
G_new = gt.GraphView(G, vfilt = reference_vertex)
G_new = gt.Graph(G_new, prune = True)
return G_new
def remove_non_query_components(G, rlist, qlist, use_gpu = False):
"""
Removes all components that do not contain a query sequence.
Args:
G (graph)
Network of queries linked to reference sequences
rlist (list)
List of reference sequence labels
qlist (list)
List of query sequence labels
use_gpu (bool)
Whether to use GPUs for network construction
Returns:
G (graph)
The resulting network
pruned_names (list)
The labels of the sequences in the pruned network
"""
components_with_query = []
combined_names = rlist + qlist
pruned_names = []
if use_gpu:
sys.stderr.write('Saving partial query graphs is not compatible with GPU networks yet\n')
sys.exit(1)
else:
# Identify network components containing queries
component_dict = gt.label_components(G)[0]
components_with_query = set()
# The number of reference sequences is len(rlist)
# These are the first len(rlist) vertices in the graph
        # Queries that have been added have indices >= len(rlist)
# Therefore these are the components to retain
for i in range(len(rlist),G.num_vertices()):
v = G.vertex(i) # Access vertex by index
components_with_query.add(component_dict[v])
# Create a boolean filter based on the list of component IDs
query_filter = G.new_vertex_property("bool")
for v in G.vertices():
query_filter[int(v)] = (component_dict[v] in components_with_query)
if query_filter[int(v)]:
pruned_names.append(combined_names[int(v)])
# Create a filtered graph with only the specified components
query_subgraph = gt.GraphView(G, vfilt=query_filter)
return query_subgraph, pruned_names