What is in utilities.py?
- This file will be downloaded along with data file in /home/pathwayanalysis once the first cell of script is executed.
Loading packages required for the calculation/analysis
import numpy as np
from networkx.drawing.nx_agraph import graphviz_layout
from graphviz import Digraph
import networkx as nx
from IPython.display import Image
File reader for mRNA sequence data and Cytoscape pathway network files
- mRNA sequence data: CSV file
- pathway network data: SIF file
def reader(filename, sif=False): if sif == True: rawdata = open(filename+'.sif','r') data = {} network_info = ['start_nodes','edge_features','end_nodes'] counter = 0 for line in rawdata: if line.strip(): line = line.strip("\n' '") line = line.split() if counter == 0: counter = counter + 1 for name in network_info: data[name]=[] for i in np.arange(len(network_info)): data[network_info[i]].append(line[i]) rawdata.close() else: rawdata = open(filename+'.csv','r') data = {} counter = 0 for line in rawdata: if line.strip(): if counter == 0: counter = counter + 1 column_names = line.strip("\n' '") data_name = line.split(",") # Creating the dictionary data structure for name in data_name: data[name]=[] else: line = line.strip("\n' '") line = line.split(",") # Storing the data in the dictionary for i in np.arange(4): element = line[i] data[data_name[i]].append(element) rawdata.close() return data
File reader for List of Receptors
def textreader(filename):
rawdata = open(filename+'.txt','r')
data = []
counter = 0
for line in rawdata:
if line.strip():
line = line.strip("\n' '")
data.append(str(line))
rawdata.close()
return data
File reader for protein expression data
- CSV file
def protReader(filename): rawdata = open(filename+'.csv','r') data = {} name = [] counter = 0 for line in rawdata: if line.strip(): if counter == 0: counter = counter + 1 continue else: line = line.strip("\n' '") line = line.split(",") # Storing the data in the dictionary if line[1] == "N/A": continue else: data[line[1]] = float(line[2]) name.append(line[1]) return data, name
Assigning edge weight based on the expression level reported in mRNA sequence data set
def prepare_process(networkFilename, mRNAseqFilename, protFilename = None):
networkdata = reader(networkFilename, sif=True)
rnadata = reader(mRNAseqFilename)
if protFilename:
protdata, protname = protReader(protFilename) # this will be fixed but for now, nothing will happen
num_of_edges = len(networkdata['start_nodes'])
unique_start = np.unique(networkdata['start_nodes'])
unique_end = np.unique(networkdata['end_nodes'])
unique_all = np.unique(networkdata['start_nodes'] + networkdata['end_nodes'])
M2_mRNA_Node = {}
M2_mRNA_Node_arr = []
M1M2_mRNA_Node = {}
M1M2_mRNA_Node_arr = []
M1_mRNA_Node = {}
M1_mRNA_Node_arr = []
# FC from M2 to M1M2 (specific case for Jill's project )
logFC_mRNA_Node_M2toM12 = {}
logFC_mRNA_Node_arr_M2toM12 = []
# FC from M12 to M2 (specific case for Jill's project )
logFC_mRNA_Node_M12toM2 = {}
logFC_mRNA_Node_arr_M12toM2 = []
# FC from M2 to M1 (M1 polarization from M2, general case)
logFC_mRNA_Node_M2toM1 = {}
logFC_mRNA_Node_arr_M2toM1 = []
# FC from M1 to M2 (M2 polarization from M1, general case)
logFC_mRNA_Node_M1toM2 = {}
logFC_mRNA_Node_arr_M1toM2 = []
for node_name in unique_all:
i = 0
for seq in rnadata['mgi_symbol']:
if node_name.lower() == seq.lower():
dataM1 = float(rnadata['M1'][i])
dataM2 = float(rnadata['M2'][i])
dataM1M2 = float(rnadata['M1M2\n'][i])
M2_mRNA_Node[node_name] = dataM2
M2_mRNA_Node_arr.append(dataM2)
M1_mRNA_Node[node_name] = dataM1
M1_mRNA_Node_arr.append(dataM1)
M1M2_mRNA_Node[node_name] = dataM1M2
M1M2_mRNA_Node_arr.append(dataM1M2)
# Convert any possible zero to small number
## division can screw this up
if dataM2 < 1e-14:
dataM2 = 1e-14
if dataM1 < 1e-14:
dataM1 = 1e-14
if dataM1M2 < 1e-14:
dataM1M2 = 1e-14
# FC from M2 to M1M2
logFCM12M2 = math.log2(dataM1M2/dataM2)
logFC_mRNA_Node_M2toM12[node_name] = logFCM12M2
logFC_mRNA_Node_arr_M2toM12.append(logFCM12M2)
# FC from M12 to M2
logFC_mRNA_Node_M12toM2[node_name] = -logFCM12M2
logFC_mRNA_Node_arr_M12toM2.append(-logFCM12M2)
# FC from M2 to M1
logFCM1M2 = math.log2(dataM1/dataM2)
logFC_mRNA_Node_M2toM1[node_name] = logFCM1M2
logFC_mRNA_Node_arr_M2toM1.append(logFCM1M2)
# FC from M1 to M2
logFC_mRNA_Node_M1toM2[node_name] = -logFCM1M2
logFC_mRNA_Node_arr_M1toM2.append(-logFCM1M2)
i += 1
## Convert the mRNA based score on Node
M2_node_score = {}
per10M2 = np.percentile(M2_mRNA_Node_arr,10)
per50M2 = np.percentile(M2_mRNA_Node_arr,50)
per95M2 = np.percentile(M2_mRNA_Node_arr,95)
for key, value in M2_mRNA_Node.items():
raw_value = float(value)
if raw_value <= per10M2 :
M2_node_score[key] = 1
elif raw_value <= per50M2 and raw_value > per10M2:
M2_node_score[key] = 2
elif raw_value <= per95M2 and raw_value > per50M2:
M2_node_score[key] = 3
elif raw_value > per95M2:
M2_node_score[key] = 4
## Convert the mRNA based score on Node
M1_node_score = {}
per10M1 = np.percentile(M1_mRNA_Node_arr,10)
per50M1 = np.percentile(M1_mRNA_Node_arr,50)
per95M1 = np.percentile(M1_mRNA_Node_arr,95)
for key, value in M1_mRNA_Node.items():
raw_value = float(value)
if raw_value <= per10M1 :
M1_node_score[key] = 1
elif raw_value <= per50M1 and raw_value > per10M1:
M1_node_score[key] = 2
elif raw_value <= per95M1 and raw_value > per50M1:
M1_node_score[key] = 3
elif raw_value > per95M1:
M1_node_score[key] = 4
## Convert the mRNA based score on Node
M1M2_node_score = {}
per10M1M2 = np.percentile(M1M2_mRNA_Node_arr,10)
per50M1M2 = np.percentile(M1M2_mRNA_Node_arr,50)
per95M1M2 = np.percentile(M1M2_mRNA_Node_arr,95)
for key, value in M1M2_mRNA_Node.items():
raw_value = float(value)
if raw_value <= per10M1M2 :
M1M2_node_score[key] = 1
elif raw_value <= per50M1M2 and raw_value > per10M1M2:
M1M2_node_score[key] = 2
elif raw_value <= per95M1M2 and raw_value > per50M1M2:
M1M2_node_score[key] = 3
elif raw_value > per95M1M2:
M1M2_node_score[key] = 4
# Convert the mRNA based score on Node
# FC from M2 to M1M2
FC_node_score_M2toM12 = {}
for key, value in logFC_mRNA_Node_M2toM12.items():
raw_value = float(value)
if raw_value <= 0 :
FC_node_score_M2toM12[key] = -1 # This is Bad
else:
FC_node_score_M2toM12[key] = 1 # This is Good
# FC from M12 to M2
FC_node_score_M12toM2 = {}
for key, value in logFC_mRNA_Node_M12toM2.items():
raw_value = float(value)
if raw_value <= 0 :
FC_node_score_M12toM2[key] = -1 # This is Bad
else:
FC_node_score_M12toM2[key] = 1 # This is Good
# FC from M2 to M1
FC_node_score_M2toM1 = {}
for key, value in logFC_mRNA_Node_M2toM1.items():
raw_value = float(value)
if raw_value <= 0 :
FC_node_score_M2toM1[key] = -1 # This is Bad
else:
FC_node_score_M2toM1[key] = 1 # This is Good
# FC from M1 to M2
FC_node_score_M1toM2 = {}
for key, value in logFC_mRNA_Node_M1toM2.items():
raw_value = float(value)
if raw_value <= 0 :
FC_node_score_M1toM2[key] = -1 # This is Bad
else:
FC_node_score_M1toM2[key] = 1 # This is Good
# Scoring the edges #########################################
'''
This is where the magic happens
'''
edge_mRNA_score_M2toM12 = []
edge_mRNA_score_M12toM2 = []
edge_mRNA_score_M2toM1 = []
edge_mRNA_score_M1toM2 = []
for n in np.arange(num_of_edges):
feature = networkdata['edge_features'][n]
startnode = networkdata['start_nodes'][n]
endnode = networkdata['end_nodes'][n]
try: # by doing so, we are skipping some ligands that are not registered in mRNA seq **
startM2 = M2_node_score[startnode]
startM1 = M1_node_score[startnode]
startM12 = M1M2_node_score[startnode]
except:
startM2 = 0
startM1 = 0
startM12 = 0
try: # by doing so, we are skipping some ligands that are not registered in mRNA seq **
endM2 = M2_node_score[endnode]
endM1 = M1_node_score[endnode]
endM12 = M1M2_node_score[endnode]
except:
#pass
endM2 = 0
endM1 = 0
endM12 = 0
try: # by doing so, we are skipping some ligands that are not registered in mRNA seq **
startFC_M2toM12 = FC_node_score_M2toM12[startnode]
startFC_M12toM2 = FC_node_score_M12toM2[startnode]
startFC_M2toM1 = FC_node_score_M2toM1[startnode]
startFC_M1toM2 = FC_node_score_M1toM2[startnode]
except:
startFC_M2toM12 = 0
startFC_M12toM2 = 0
startFC_M1toM2 = 0
startFC_M2toM1 = 0
try: # by doing so, we are skipping some ligands that are not registered in mRNA seq **
endFC_M2toM12 = FC_node_score_M2toM12[endnode]
endFC_M12toM2 = FC_node_score_M12toM2[endnode]
endFC_M2toM1 = FC_node_score_M2toM1[endnode]
endFC_M1toM2 = FC_node_score_M1toM2[endnode]
except:
endFC_M2toM12 = 0
endFC_M12toM2 = 0
endFC_M1toM2 = 0
endFC_M2toM1 = 0
if feature == 'up-regulates':
score_M2toM12 = math.exp(-((startM2 + endM2)/2+startFC_M2toM12+endFC_M2toM12))
score_M12toM2 = math.exp(-((startM12 + endM12)/2+startFC_M12toM2+endFC_M12toM2))
score_M2toM1 = math.exp(-((startM2 + endM2)/2+startFC_M2toM1+endFC_M2toM1))
score_M1toM2 = math.exp(-((startM1 + endM1)/2+startFC_M1toM2+endFC_M1toM2))
elif feature == 'down-regulates':
score_M2toM12 = math.exp(((startM2 + endM2)/2+startFC_M2toM12+endFC_M2toM12)) + 100 # I could add 1000 but let's see how it goes
score_M12toM2 = math.exp(((startM12 + endM12)/2+startFC_M12toM2+endFC_M12toM2)) + 100 # I could add 1000 but let's see how it goes
score_M2toM1 = math.exp(((startM2 + endM2)/2+startFC_M2toM1+endFC_M2toM1)) + 100 # I could add 1000 but let's see how it goes
score_M1toM2 = math.exp(((startM1 + endM1)/2+startFC_M1toM2+endFC_M1toM2)) + 100 # I could add 1000 but let's see how it goes
edge_mRNA_score_M2toM12.append(score_M2toM12)
edge_mRNA_score_M12toM2.append(score_M12toM2)
edge_mRNA_score_M2toM1.append(score_M2toM1)
edge_mRNA_score_M1toM2.append(score_M1toM2)
return unique_all, networkdata, edge_mRNA_score_M2toM12, edge_mRNA_score_M12toM2, edge_mRNA_score_M2toM1, edge_mRNA_score_M1toM2
Generating network figure via NetworkX and perform analysis based on the weight assigned by the preceeding script
- The png file will be stored in your current working directory.
- In this particular example, the current working directory is assigned to be /content/pathwayanalysis
- The file name is ‘network_figure_XXXX.png’
def networkAnalaysis(unique_all, networkdata, edge_mRNA_score, figure_file_name, receptorlistFilename, destination,listofNodes_path):
## Drawing part
G2 = Digraph('unix', filename='fullpicture',format='png',
node_attr={'shape':'box',
'color': 'blue:purple',
'style': 'filled',
'fontcolor':'white'},
edge_attr={'color': 'red'})
for node in unique_all:
G2.node(node)
for n in np.arange(len(networkdata['start_nodes'])):
weight = edge_mRNA_score[n]
G2.edge(networkdata['start_nodes'][n],networkdata['end_nodes'][n],label=str(weight))
G2.view(figure_file_name)
## Actual Network Analysis
G = nx.DiGraph()
for node in unique_all:
G.add_node(node)
<<<<<<< HEAD
G.nodes()
for n in np.arange(len(networkdata['start_nodes'])):
weight = edge_mRNA_score[n]
G.add_edge(networkdata['start_nodes'][n],networkdata['end_nodes'][n],weight=weight)
G.edges()
Receptor_info = textreader(receptorlistFilename)
Receptor_name = Receptor_info['Receptors']
Receptor_feature = Receptor_info['Pro/Anti']
Outcome = destination
Scorelist = []
ReceptorList = []
ReceptorFeatureList = []
PathwayNodes = {}
i = 0
for R in Receptor_name:
try:
Path = nx.dijkstra_path(G,R,Outcome)
Score = nx.dijkstra_path_length(G,R,Outcome)
Scorelist.append(Score)
ReceptorList.append(R)
ReceptorFeatureList.append(Receptor_feature[i])
PathwayNodes[R] = Path
if listofNodes_path:
print('The shortest path from ',R,' (Upstream Receptor) to ',Outcome,' in a weighted Pathway Network: \n', Path,'\n with score of ',Score,'\n')
except:
pass
i += 1
collected_data = {'Receptors': ReceptorList, 'Pro/Anti': ReceptorFeatureList, 'Score': Scorelist}
return collected_data, PathwayNodes
=======
prot_exp_score += prot_exp_val
#print(p_name)
#print('-- Fold Change from Protein Expression: ',prot_exp_val)
else:
prot_exp_val = 0
pathscore = pathScore[Receptor_name]
if pathscore > 1000:
pathscore = -1000
else:
pathscore = np.log10(1/pathscore)
total = 0.1*M2Score + 10*PolarizeScore + pathscore + prot_exp_score*10
#print('- Pathway Score: ',total, 'pathway length: ',pathScore[Receptor_name])
#print('----END----')
return total
- The following is the way converting expression into score
total = 0.1*M2Score + 10*PolarizeScore + pathscore + prot_exp_score*10
- pathscore is either quite small (< 100) or big (> 1000) number. Small number indicates more favorable path than big score pathway because 1000 score is added to the pathway score if path is inhibitory. Therefore, we convert the score to -1000 if the original score is bigger than 1000.
- M2Score indicates the expression of M1 specific gene in M2 under the assumption that its abundance will be addition for M1 polarization.
- PolarizeScore indicates the expression change of M1 specific gene due to M1 MEV treatment (M1M2).
- prot_exp_score indicates the expression of gene in M2 over M1-derived vesicles (e.g. values >1 indicate expression is higher in M2s) nly selective proteins listed as nodes in selected-receptor mediated pathway.
Generating the final outcome.
def score_per_receptor(destination,
data_mrna,
receptor_list,
name_from_prot,
prot_data,
data_network,
node_names,
edge_weight_dict):
Receptors, pathDict, pathScore = analysis_prep(data_network,
receptor_list,
node_names,
edge_weight_dict,
destination)
collected_score = []
tested_receptors = []
#print(Receptors)
for receptor_name in Receptors:
try:
score = exp_checker(receptor_name,
data_mrna,
name_from_prot,
prot_data,
pathScore,
pathDict)
collected_score.append(score)
tested_receptors.append(receptor_name)
except:
pass
receptor_and_score = {'Receptors': tested_receptors, 'Score': collected_score}
#print(receptor_and_score)
return receptor_and_score, pathDict
>>>>>>> 0169e4e62426ccf5de24feb24c80020c08b81c59
Checking mRNA sequence and protein expression data against specific node
def expression_search(name_node,
data_mrna,
prot_data,
name_from_prot):
for name_prot in name_from_prot:
if name_node.lower() == name_prot.lower():
element_prot = prot_data[name_prot]
try:
if element_prot:
prot_exp = element_prot
except:
prot_exp = 'N/A'
for name_mrna in data_mrna['mgi_symbol']:
if name_node.lower() == name_mrna.lower():
n = data_mrna['mgi_symbol'].index(name_mrna)
expM2 = float(data_mrna['M2'][n])
expM1 = float(data_mrna['M1'][n])
expM1M2 = float(data_mrna['M1M2'][n])
compFC = math.log2(expM1M2/expM2)
try:
if n:
M1 = expM1
M2 = expM2
M1M2 = expM1M2
logFCofM1M2overM2 = compFC
except:
M1 = 'N/A'
M2 = 'N/A'
M1M2 = 'N/A'
logFCofM1M2overM2 = 'N/A'
return prot_exp, M1, M2, M1M2, logFCofM1M2overM2
What’s in pathwaySearch.py?
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from graphviz import Digraph
import utilities as util
import yaml
def runItAll(
mRNA_inputfilename = 'mRNAdata',
Prot_inputfilename = 'Ab_Chris',
network_inputfilename = 'network',
receptor_list = 'receptorlist',
ligand_list_file_name = 'ligandlist',
destination = 'M1_polarization',
unit_test_receptor = False,
display=True,
listofNodes_path=True,
):
##
## Misc
##
unique_all, networkdata, edge_mRNA_score_M2toM12, edge_mRNA_score_M12toM2, edge_mRNA_score_M2toM1, edge_mRNA_score_M1toM2 = util.prepare_process(network_inputfilename,mRNA_inputfilename)
if destination == 'M1_polarization':
collected_data1, PathwayNodes1 = util.networkAnalaysis(unique_all, networkdata, edge_mRNA_score_M2toM12, 'new_figure_target'+destination, receptor_list, destination,listofNodes_path)
collected_data2, PathwayNodes2 = util.networkAnalaysis(unique_all, networkdata, edge_mRNA_score_M2toM1, 'new_figure_control'+destination, receptor_list, destination,listofNodes_path)
elif destination == 'M2_polarization':
collected_data1, PathwayNodes1 = util.networkAnalaysis(unique_all, networkdata, edge_mRNA_score_M12toM2, 'new_figure_target'+destination, receptor_list, destination,listofNodes_path)
collected_data2, PathwayNodes2 = util.networkAnalaysis(unique_all, networkdata, edge_mRNA_score_M1toM2, 'new_figure_control'+destination, receptor_list, destination,listofNodes_path)
df_target = pd.DataFrame(collected_data1)
df_control = pd.DataFrame(collected_data2)
network_inputfilename = re.sub('\.sif','',network_inputfilename)
mRNA_inputfilename = re.sub('\.csv','',mRNA_inputfilename)
#####################################################
### Extracting Gene names from mRNA sequence data ###
#####################################################
data_mrna = util.reader(mRNA_inputfilename)
#########################################
### Extracting protein names and data ###
#########################################
data_prot, prot_name = util.protReader(Prot_inputfilename)
########################################################################
### Extracting Node names from pathway file generated from Cytoscape ###
########################################################################
data_network = util.reader(network_inputfilename,sif=True)
#########################################################
### Extracting Receptor node names from Receptor List ###
#########################################################
receptor_list = util.textreader(receptor_list)
##############################################################################
### Sorting nodes and edges ###
### and add the weight to each edge based on its unique feature ###
##############################################################################
node_names = np.unique(data_network['start_nodes']+data_network['end_nodes']) ## get unique node name
with open(ligand_list_file_name+'.yaml') as file:
ligand_list = yaml.load(file)
receptor_specifics = {}
ligand_specifics = {}
for R in receptor_list['Receptors']:
receptor_specifics[R] = {}
ligand_specifics[R] = {}
try:
list_node = PathwayNodes1[R]
for node in list_node:
receptor_specifics[R][node] = {}
prot_exp, M1, M2, M1M2, logFCofM1M2overM2 = util.expression_search(node,
data_mrna,
data_prot,
prot_name)
receptor_specifics[R][node]['mAb(From M1 to M2: M2/M1)'] = prot_exp
receptor_specifics[R][node]['M1_mRNA'] = M1
receptor_specifics[R][node]['M2_mRNA'] = M2
receptor_specifics[R][node]['M1M2_mRNA'] = M1M2
receptor_specifics[R][node]['logFC(M1M2/M2)'] = logFCofM1M2overM2
except:
pass
try:
list_ligand = ligand_list[R]
for ligand in list_ligand:
ligand_specifics[R][ligand] = {}
prot_exp, M1, M2, M1M2, logFCofM1M2overM2 = util.expression_search(ligand,
data_mrna,
data_prot,
prot_name)
ligand_specifics[R][ligand]['mAb(From M1 to M2: M2/M1)'] = prot_exp
ligand_specifics[R][ligand]['M1_mRNA'] = M1
ligand_specifics[R][ligand]['M2_mRNA'] = M2
ligand_specifics[R][ligand]['M1M2_mRNA'] = M1M2
ligand_specifics[R][ligand]['logFC(M1M2/M2)'] = logFCofM1M2overM2
except:
pass
return collected_data1, collected_data2, df_target, df_control, receptor_specifics, ligand_specifics
#!/usr/bin/env python
import sys
##################################
#
# Revisions
# 10.08.10 inception
#
##################################
#
# Message printed when program run without arguments
#
def helpmsg():
scriptName= sys.argv[0]
msg="""
Purpose:
Usage:
"""
msg+=" %s -run" % (scriptName)
msg+="""
Notes:
"""
return msg
#
# MAIN routine executed when launching this script from command line
#
if __name__ == "__main__":
import sys
msg = helpmsg()
remap = "none"
if len(sys.argv) < 2:
raise RuntimeError(msg)
#fileIn= sys.argv[1]
#if(len(sys.argv)==3):
# 1
# #print "arg"
# Loops over each argument in the command line
for i,arg in enumerate(sys.argv):
# calls 'doit' with the next argument following the argument '-validation'
if(arg=="-run"):
runItAll()
quit()
raise RuntimeError("Arguments not understood")