-
Notifications
You must be signed in to change notification settings - Fork 6
/
streamlit.py
484 lines (404 loc) · 16.2 KB
/
streamlit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
from typing import Dict, List
import streamlit as st
import pandas as pd
from io import StringIO
import re
from datetime import datetime
import numpy
from dnachisel import *
from Bio import Seq, SeqIO
from Bio.Restriction.Restriction_Dictionary import rest_dict
from streamlit_searchbox import st_searchbox
from thefuzz import process
from functools import reduce
from streamlit_modal import Modal
import streamlit.components.v1 as components
from st_keyup import st_keyup
# Note that this, which is gitignored, is also excluded from gcloud builds.
# For that reason, and cleanliness, that DB has been copied to a data/ directory.
COCOPUTS_DB_FNAME = "data/cocoput_table.tsv"
COCOPUTS_INDEX_FNAME = "data/cocoput_index.csv"
@st.experimental_singleton
def get_cocoput_organism_index():
"""Load an index of all organisms available in the CoCoPuts database."""
df = pd.read_csv(COCOPUTS_INDEX_FNAME, index_col=False)
return pd.Series(
df.apply(lambda r: f"{r['Species']} (TaxID: {r['Taxid']})", axis=1)
.unique()
)
@st.experimental_singleton
def get_cocoput_organism_list():
"""Return the index as a list."""
return get_cocoput_organism_index().tolist()
def search_organisms(searchterm) -> List[str]:
"""Search the cocoput index for the given search term."""
print(f'Searching for {searchterm}', flush=True)
matches = get_cocoput_organism_index()
matches = matches[reduce((lambda a, b: a & b), [matches.str.lower().str.contains(t.lower()) for t in searchterm.split()])]
if matches.shape[0] > 100:
matches = matches.iloc[:100]
return matches.tolist()
def get_taxid_from_cocoput_name(cocoput_name):
"""Extract the TaxID from the cocoput name, which by convention ends with '(TaxID: ###)'."""
rematch = re.match(r".*\(TaxID: (\d+)\)", cocoput_name)
assert rematch, f"Somehow the cocoput name was poorly formatted, {cocoput_name}"
return int(rematch.groups()[0])
AA_TO_CODON: Dict[str, List[str]] = {
"*": ["TAA", "TAG", "TGA"],
"A": ["GCA", "GCC", "GCG", "GCT"],
"C": ["TGC", "TGT"],
"D": ["GAC", "GAT"],
"E": ["GAA", "GAG"],
"F": ["TTC", "TTT"],
"G": ["GGA", "GGC", "GGG", "GGT"],
"H": ["CAC", "CAT"],
"I": ["ATA", "ATC", "ATT"],
"K": ["AAA", "AAG"],
"L": ["CTA", "CTC", "CTG", "CTT", "TTA", "TTG"],
"M": ["ATG"],
"N": ["AAC", "AAT"],
"P": ["CCA", "CCC", "CCG", "CCT"],
"Q": ["CAA", "CAG"],
"R": ["AGA", "AGG", "CGA", "CGC", "CGG", "CGT"],
"S": ["AGC", "AGT", "TCA", "TCC", "TCG", "TCT"],
"T": ["ACA", "ACC", "ACG", "ACT"],
"V": ["GTA", "GTC", "GTG", "GTT"],
"W": ["TGG"],
"Y": ["TAC", "TAT"],
}
def convert_cocoputs_table_to_dnachisel(
codon_table_counts: dict,
) -> Dict[str, Dict[str, float]]:
"""Convert the CoCoPuts codon usage table to a DNAChisel codon usage table."""
new_codon_table: Dict[str, Dict[str, float]] = {}
for aa in AA_TO_CODON:
new_codon_table[aa] = {}
codon_sum: int = sum([codon_table_counts[codon] for codon in AA_TO_CODON[aa]])
for codon in AA_TO_CODON[aa]:
new_codon_table[aa][codon] = round(codon_table_counts[codon] / codon_sum, 3)
return new_codon_table
@st.experimental_memo
def get_codon_table_for_taxid(taxid):
"""Return the CoCoPuts codon usage table for the given TaxID."""
df = pd.read_csv(COCOPUTS_DB_FNAME, sep="\t", index_col=False)
subset = df[df.Taxid == taxid]
row = subset[subset["# CDS"] == subset["# CDS"].max()].iloc[0]
codons = [a + b + c for a in "ATCG" for b in "ATCG" for c in "ATCG"]
codon_table = {k: v for k, v in row.to_dict().items() if k in codons}
return convert_cocoputs_table_to_dnachisel(codon_table)
st.set_page_config(page_title="BaseBuddy")
c1, c2, c3 = st.columns((1, 5, 1))
with c2:
st.image("resources/logo/png/logo-no-background.png") # , width=500
st.write("""
**Instructions:**
**1.) Choose your desired optimization method.**
**2.) Specify your target organism.**
**3.) Paste your coding sequence(s) into the textbox below and press Enter or upload a (multi-)FASTA file.**
**4.) Click the "Download Result(s)" button to generate a FASTA file with your optimized sequence(s).**
Check out the advanced settings if you have special requirements for your optimized sequence
(e.g. avoiding certain cut sites or patterns, or lowering synthesis difficulty).
For further questions or concerns please visit https:/JBEI/basebuddy
"""
)
footer = """
<style>
a:link , a:visited {
color: blue;
background-color: transparent;
text-decoration: underline;
}
a:hover, a:active {
color: red;
background-color: transparent;
text-decoration: underline;
}
.footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
background-color: white;
color: black;
text-align: center;
}
</style>
<div class="footer">
<i>Powered by <a href="https://edinburgh-genome-foundry.github.io/DnaChisel/">DNA Chisel</a> and <a href="https://dnahive.fda.gov/dna.cgi?cmd=cuts_main">CoCoPUTs</a>.</i>
</div>
"""
st.markdown(footer, unsafe_allow_html=True)
col1, col2 = st.columns(2)
with col1:
optimization_method = st.radio(
"**Optimization Method**",
["use_best_codon", "match_codon_usage", "harmonize_rca"],
key="visibility",
help='Choose a codon optimization method. See details on the [DNAChisel website](https://edinburgh-genome-foundry.github.io/DnaChisel/ref/builtin_specifications.html#codon-optimization-specifications).'
)
database = st.radio(
"**Codon Usage Database**",
["CoCoPUTs", "Kazusa"],
key="database",
help='Choose a codon usage table database. CoCoPUTs is considered more accurate and up-to-date.'
)
with col2:
st.caption('**Target Organism**')
target_organism = (st_searchbox(
search_organisms,
key="target_searchbox",
) or "")
target_taxid = get_taxid_from_cocoput_name(target_organism) if target_organism else None
target_coding_table = get_codon_table_for_taxid(target_taxid) if target_taxid else None
if optimization_method == "harmonize_rca":
st.caption("**Source Organism**")
source_organism = (st_searchbox(
search_organisms,
key="source_organism",
) or "")
source_taxid = get_taxid_from_cocoput_name(source_organism) if source_organism else None
source_coding_table = get_codon_table_for_taxid(source_taxid) if source_taxid else None
else:
source_coding_table = None
paste_tab, upload_tab = st.tabs(['Paste', 'FASTA Upload'])
with paste_tab:
original_fasta_str = st.text_area(
"**Insert your native nucleotide sequence(s) in FASTA format:**",
">example_sequence1\nATGAGTAGT\n>example_sequence2\nATGGTGAATTTG",
help=f"It is important to only use the native coding sequence because some optimization methods calculate the relative codon adaptation (RCA) based on the inserted sequence.",
)
with upload_tab:
uploaded_fasta_file = st.file_uploader(
"**Upload your (multi-)FASTA file here:**", type=[".fasta",".fa"]
)
if uploaded_fasta_file is not None:
text_io = uploaded_fasta_file.read().decode("UTF-8")
original_fasta_str = text_io
with StringIO(original_fasta_str) as fio:
records = list(SeqIO.parse(fio, "fasta"))
if len(records) == 0:
st.warning(f"Found zero valid records in textbox, should have one.")
st.stop()
def process_substring(substring):
# Convert substring to uppercase, strip whitespace
return substring.strip().upper()
def validate_input_string(input_string):
allowed_chars = {'A', 'T', 'G', 'C', ','}
for char in input_string:
if char.upper() not in allowed_chars and char != ',' and not char.isspace():
return False
return True
# A list of constraints to impose, including cut site constraints.
cut_site_constraints = []
custom_pattern_constraints = []
with st.expander("Advanced Settings"):
homopolycols = st.columns(4)
with homopolycols[0]:
poly_a_maxlength = st.number_input(
"Avoid poly As",
value=9,
min_value=1,
max_value=15,
step=1,
help="Set the allowed maximum length of consecutive As.",
)
with homopolycols[1]:
poly_t_maxlength = st.number_input(
"Avoid poly Ts",
value=9,
min_value=1,
max_value=15,
step=1,
help="Set the allowed maximum length of consecutive Ts.",
)
with homopolycols[2]:
poly_c_maxlength = st.number_input(
"Avoid poly Cs",
value=6,
min_value=1,
max_value=15,
step=1,
help="Set the allowed maximum length of consecutive Cs.",
)
with homopolycols[3]:
poly_g_maxlength = st.number_input(
"Avoid poly Gs",
value=6,
min_value=1,
max_value=15,
step=1,
help="Set the allowed maximum length of consecutive Gs.",
)
hairpin_c1, hairpin_c2 = st.columns((1, 1))
with hairpin_c1:
hairpin_stem_size = st.number_input(
"Hairpin Stem Size",
value=10,
min_value=1,
max_value=100,
step=1,
help="Set the allowed maximum hairpin stem size.",
)
with hairpin_c2:
hairpin_window = st.number_input(
"Hairpin Window",
value=100,
min_value=50,
max_value=500,
step=1,
help="Define the minimum distance between hairpins.",
)
enforce_gc = st.columns(3)
with enforce_gc[0]:
gc_minimum = st.number_input(
"Min. GC-content",
value=0.3,
min_value=0.1,
max_value=0.9,
step=0.01,
help="Define the minium GC-content in a given window."
)
with enforce_gc[1]:
gc_maximum = st.number_input(
"Max. GC-content",
value=0.75,
min_value=0.1,
max_value=0.9,
step=0.01,
help="Define the maximum GC-content in a given window."
)
with enforce_gc[2]:
gc_window = st.number_input(
"GC-content Window",
value=50,
min_value=10,
max_value=200,
step=1,
help="Define the window to enforce the set GC-content."
)
restriction_sites = st.multiselect(
"Avoid cut sites",
options=rest_dict,
default=['BamHI', 'NdeI', 'XhoI', 'SpeI', 'BsaI']
)
for restricition_site in restriction_sites:
cut_site_constraints.append(AvoidPattern(restricition_site+"_site"))
# Get Avoid custom pattern string from the user
input_string = st.text_input(
"Avoid custom pattern(s)",
placeholder="e.g. atta, gggtttaaa, ...",
help='''Adding a single DNA sequence or multiple sequences divided by commas will avoid that specific sequence pattern on both strands. Only As, Ts, Gs, Cs, or commas are allowed (not case-sensitive). For example, the input:"atta, agggt" will avoid the sequence pattern "ATTA" and "AGGGT" in the optimized output.'''
)
if input_string.strip(): # Check if input_string is not empty after stripping whitespace
if validate_input_string(input_string):
# Split the input string by commas
input_sequences = input_string.split(',')
# Process each input sequence and create constraints
for sequence in input_sequences:
processed_sequence = process_substring(sequence)
if processed_sequence: # Check if the processed sequence is non-empty
custom_pattern_constraints.append(AvoidPattern(processed_sequence)) # Assuming AvoidPattern accepts the processed sequence
else:
st.warning("Error: Input string contains invalid characters. Only 'As', 'Ts', 'Gs', 'Cs', or commas are allowed.")
uniquify_kmers = st.columns(2)
with uniquify_kmers[0]:
kmers_value = st.number_input(
"Uniquify All K-mers",
value=10,
min_value=1,
max_value=20,
step=1,
help="This will ensure lower DNA synthesis diffculty due to less repetitive codons. Changing to a lower value might further lower synthesis difficulty but can also result in an unsolvable constraint."
)
with uniquify_kmers[1]:
kmers_rev = st.checkbox(
"Include reverse complement",
value=True,
help="Uniquify the k-mers of the reverse complement as well."
)
randomize_numpy_seed = st.checkbox(
"Numpy random generator",
value=False,
help="This ensures reproducible results. With this unchecked, an identical input sequence will always result in an identical output. However, checking this should not affect codon optimization result."
)
# Do some input validation.
if not target_coding_table:
st.warning("Please specify a target organism.")
st.stop()
if database == 'CoCoPUTs':
codon_optimize_kwargs = {
"codon_usage_table": target_coding_table
}
if optimization_method == "harmonize_rca":
if not source_coding_table:
st.warning("Please specify a source organism if using the harmonize_rca method.")
st.stop()
codon_optimize_kwargs["original_codon_usage_table"] = source_coding_table
elif database == 'Kazusa':
codon_optimize_kwargs = {
"species": target_taxid
}
if optimization_method == "harmonize_rca":
if not source_taxid:
st.warning("Please specify a source organism if using the harmonize_rca method.")
st.stop()
codon_optimize_kwargs["original_species"] = source_taxid
else:
raise KeyError(f'Unrecognized database {database}. Maybe the database dropdown was reformatted?')
constraints_logs = []
objectives_logs = []
recodings = []
try:
for record in records:
if randomize_numpy_seed:
numpy.random.seed(None)
else:
numpy.random.seed(123)
problem = DnaOptimizationProblem(
sequence=record,
constraints=[
UniquifyAllKmers(kmers_value, include_reverse_complement=kmers_rev),
AvoidHairpins(
stem_size=hairpin_stem_size, hairpin_window=hairpin_window
),
AvoidPattern(str(poly_a_maxlength) + "xA"),
AvoidPattern(str(poly_t_maxlength) + "xT"),
AvoidPattern(str(poly_c_maxlength) + "xC"),
AvoidPattern(str(poly_g_maxlength) + "xG"),
EnforceGCContent(mini=gc_minimum, maxi=gc_maximum, window=gc_window),
EnforceTranslation(),
] + cut_site_constraints + custom_pattern_constraints,
objectives=[
CodonOptimize(
method=optimization_method,
**codon_optimize_kwargs,
)
],
)
problem.max_random_iters = 10000
problem.resolve_constraints()
problem.optimize()
constraints_logs.append(problem.constraints_text_summary())
objectives_logs.append(problem.objectives_text_summary())
recodings.append(problem.sequence)
except Exception as e:
st.warning(e)
st.stop()
result_list=[]
for record, recoding in zip(records, recodings):
if optimization_method == 'harmonize_rca':
notes = f'method: {optimization_method}, source_taxid: {source_taxid}, target_taxid: {target_taxid}'
else:
notes = f'method: {optimization_method}, target_taxid: {target_taxid}'
record_result = f">{record.id} ({notes}) Please cite us :) \n{recoding}\n"
result_list.append(record_result)
st.download_button(label="**Download result(s)**",
data = "".join(str(j) for j in result_list),
file_name = datetime.now().strftime("%Y%m%d-%I%M%S%p_") + "BaseBuddy_results" + ".fasta",
)
st.text_area(
"Or copy your result(s) from this textbox:","".join(str(j) for j in result_list)
)
with st.expander("Optimization Logs"):
for log in constraints_logs + objectives_logs:
st.text(log)