-
Notifications
You must be signed in to change notification settings - Fork 21
/
postanalysis_path.py
142 lines (124 loc) · 5.47 KB
/
postanalysis_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import argparse
import networkx as nx
parser = argparse.ArgumentParser(
'Find shortest paths along domains in residues.')
parser.add_argument('--num-residues', type=int, default=77,
help='Number of residues of the PDB.')
parser.add_argument('--windowsize', type=int, default=56,
help='window size')
parser.add_argument('--dist-threshold', type=int, default=12,
help='threshold for shortest distance')
parser.add_argument('--filename', type=str, default='logs/out_probs_train.npy',
help='File name of the probs file.')
parser.add_argument('--source-node', type=int, default=46,
help='source residue of the PDB')
parser.add_argument('--outputfilename', type=str, default='logs/source46.txt',
help='File of shortest path from source to targets')
args = parser.parse_args()
# According to the distribution of learned edges between residues, we calculated the shortest path
# from mutation site to the residues in the active loop.
def getEdgeResults(threshold=False):
a = np.load(args.filename)
b = a[:, :, 1]
c = a[:, :, 2]
d = a[:, :, 3]
# There are four types of edges, eliminate the first type as the non-edge
probs = b+c+d
# For default residue number 77, residueR2 = 77*(77-1)=5852
residueR2 = args.num_residues*(args.num_residues-1)
probs = np.reshape(probs, (args.windowsize, residueR2))
# Calculate the occurence of edges
edges_train = probs/args.windowsize
results = np.zeros((residueR2))
for i in range(args.windowsize):
results = results+edges_train[i, :]
if threshold:
# threshold, default 0.6
index = results < (args.threshold)
results[index] = 0
# Calculate prob for figures
edges_results = np.zeros((args.num_residues, args.num_residues))
count = 0
for i in range(args.num_residues):
for j in range(args.num_residues):
if not i == j:
edges_results[i, j] = results[count]
count += 1
else:
edges_results[i, j] = 0
return edges_results
def dist_cal(AA1, AA2):
"""
calculate CA-CA distance
"""
dist = np.sqrt(np.square(
AA1['x'] - AA2['x']) + np.square(AA1['y'] - AA2['y']) + np.square(AA1['z'] - AA2['z']))
return dist
# Load distribution of learned edges
edges_results = getEdgeResults()
dists_matrix = list()
for i in range(1, 2):
tmp_matrix = np.zeros((args.num_residues, args.num_residues))
df = pd.read_csv('data/pdb/ca_%d.pdb' % i, sep='\s+', names=[
'ATOM', 'n1', 'CA', 'AA', 'Chain', 'n2', 'x', 'y', 'z', 'n3', 'n4', 'C']) # temparory DataFrame
tdf = df.iloc[1:(args.num_residues+1)]
for ind_1 in range(args.num_residues-1):
for ind_2 in range(ind_1+1, args.num_residues):
AA1 = tdf.iloc[ind_1]
AA2 = tdf.iloc[ind_2]
tmp_dist = dist_cal(AA1, AA2)
tmp_matrix[ind_1, ind_2] = tmp_dist
tmp_matrix[ind_2, ind_1] = tmp_dist
dists_matrix.append(tmp_matrix)
dists_matrix = np.array(dists_matrix)
dists_mean = np.zeros((args.num_residues, args.num_residues))
for ind_1 in range(args.num_residues-1):
for ind_2 in range(ind_1+1, args.num_residues):
dists_mean[ind_1, ind_2] = dists_matrix[:, ind_1, ind_2].mean()
dists_mean[ind_2, ind_1] = dists_matrix[:, ind_2, ind_1].mean()
np.save('logs/dists_mean_12.npy', dists_mean)
# if the distance is larer than 12, then ignore
filtered_edges = np.where(dists_mean > args.dist_threshold, 1, edges_results)
np.save('logs/filtered_edges_12.npy', filtered_edges)
edges = np.load('logs/filtered_edges_12.npy')
# The network is directed
edges_list = list()
# Default: i->j
for i in range(args.num_residues):
for j in range(args.num_residues):
if i != j:
edges_list.append((i, j, {'weight': filtered_edges[j, i]}))
MDG = nx.MultiDiGraph()
MDG.add_edges_from(edges_list)
source_node = args.source_node # set source node
target_nodes = [61, 62, 63, 64, 65, 66, 67, 68, 69, 70] # set target nodes
out_file = args.outputfilename
path_dict = dict()
for tn in target_nodes:
path_dict[tn] = []
path_nodes = nx.dijkstra_path(MDG, source_node, tn)
path_length_list = [] # save the length of shorest path
path_length_list.append(nx.dijkstra_path_length(MDG, source_node, tn))
path_dict[tn].append('shortest_path : ' + '->'.join(list(map(str, path_nodes))) +
' : ' + str(nx.dijkstra_path_length(MDG, source_node, tn)))
if len(path_nodes) > 2:
for ipn in range(1, len(path_nodes)):
tmp_MDG = MDG.copy()
tmp_MDG.remove_edge(path_nodes[ipn-1], path_nodes[ipn])
tmp_path_nodes = nx.dijkstra_path(tmp_MDG, source_node, tn)
path_length_list.append(
nx.dijkstra_path_length(tmp_MDG, source_node, tn))
path_dict[tn].append('remove(%d->%d) : ' % (path_nodes[ipn-1], path_nodes[ipn]) + '->'.join(
list(map(str, tmp_path_nodes))) + ' : ' + str(nx.dijkstra_path_length(tmp_MDG, source_node, tn)))
# According to the shortest path length from small to large
path_dict[tn] = np.array(path_dict[tn])[np.argsort(path_length_list)]
with open(out_file, 'w') as f:
for k, v in path_dict.items():
f.write('target node : %d\n' % k)
for tmp_path in v:
f.write('\t\t' + tmp_path + '\n')