Spaces:
Sleeping
Sleeping
supercat666
committed on
Commit
·
0d0c645
1
Parent(s):
fc0071d
added cas9 off
Browse files- .idea/.gitignore +3 -0
- .idea/CRISPRTool.iml +10 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- app.py +148 -85
- cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json +1 -0
- cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5 +3 -0
- cas9_model/on-cla.h5 +3 -0
- cas9off.py +119 -0
- cas9on.py +78 -0
- tiger.md → crisprTool.md +0 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# 默认忽略的文件
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/CRISPRTool.iml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$">
|
5 |
+
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
6 |
+
</content>
|
7 |
+
<orderEntry type="inheritedJdk" />
|
8 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
9 |
+
</component>
|
10 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (CRISPRTool)" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/CRISPRTool.iml" filepath="$PROJECT_DIR$/.idea/CRISPRTool.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import os
|
2 |
import tiger
|
|
|
|
|
3 |
import pandas as pd
|
4 |
import streamlit as st
|
5 |
from pathlib import Path
|
6 |
|
7 |
# title and documentation
|
8 |
-
st.markdown(Path('
|
9 |
st.divider()
|
10 |
|
11 |
CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']
|
@@ -13,107 +15,169 @@ CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']
|
|
13 |
selected_model = st.selectbox('Select CRISPR model:', CRISPR_MODELS, key='selected_model')
|
14 |
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
# Use a radio button to select enzymes, making sure only one can be selected at a time
|
19 |
-
enzyme_selection = st.radio(
|
20 |
-
"Select an enzyme:",
|
21 |
-
('SPCas9_U6', 'SPCas9_t7', 'eSPCas9', 'SPCas9_HF1'),
|
22 |
-
key='enzyme_selection'
|
23 |
-
)
|
24 |
-
|
25 |
-
# Actions based on the selected enzyme
|
26 |
-
if enzyme_selection == 'SPCas9_U6':
|
27 |
-
# Placeholder for action when SPCas9_U6 is selected
|
28 |
-
pass
|
29 |
-
elif enzyme_selection == 'SPCas9_t7':
|
30 |
-
# Placeholder for action when SPCas9_t7 is selected
|
31 |
-
pass
|
32 |
-
elif enzyme_selection == 'eSPCas9':
|
33 |
-
# Placeholder for action when eSPCas9 is selected
|
34 |
-
pass
|
35 |
-
elif enzyme_selection == 'SPCas9_HF1':
|
36 |
-
# Placeholder for action when SPCas9_HF1 is selected
|
37 |
-
pass
|
38 |
-
elif selected_model == 'Cas12':
|
39 |
-
# Placeholder for Cas12 model loading
|
40 |
-
# TODO: Implement Cas12 model loading logic
|
41 |
-
raise NotImplementedError("Cas12 model loading not implemented yet.")
|
42 |
-
elif selected_model == 'Cas13d':
|
43 |
-
ENTRY_METHODS = dict(
|
44 |
-
manual='Manual entry of single transcript',
|
45 |
-
fasta="Fasta file upload (supports multiple transcripts if they have unique ID's)"
|
46 |
-
)
|
47 |
-
@st.cache_data
|
48 |
-
def convert_df(df):
|
49 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
50 |
return df.to_csv().encode('utf-8')
|
51 |
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
|
|
|
|
|
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
st.progress(percent_complete / 100)
|
65 |
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
st.session_state.input_error = None
|
72 |
-
st.session_state.on_target = None
|
73 |
-
st.session_state.titration = None
|
74 |
-
st.session_state.off_target = None
|
75 |
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
if st.session_state.entry_method == ENTRY_METHODS['manual']:
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
-
#
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
with open(fasta_path, 'w') as f:
|
91 |
-
f.write(st.session_state.fasta_entry.getvalue().decode('utf-8'))
|
92 |
-
transcripts = tiger.load_transcripts([fasta_path], enforce_unique_ids=False)
|
93 |
-
os.remove(fasta_path)
|
94 |
-
|
95 |
-
# convert to upper case as used by tokenizer
|
96 |
-
transcripts[tiger.SEQ_COL] = transcripts[tiger.SEQ_COL].apply(lambda s: s.upper().replace('U', 'T'))
|
97 |
-
|
98 |
-
# ensure all transcripts have unique identifiers
|
99 |
-
if transcripts.index.has_duplicates:
|
100 |
-
st.session_state.input_error = "Duplicate transcript ID's detected in fasta file"
|
101 |
|
102 |
-
#
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
105 |
|
106 |
-
#
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
#
|
111 |
-
|
112 |
-
st.session_state.
|
|
|
|
|
|
|
|
|
|
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
if __name__ == '__main__':
|
116 |
-
|
117 |
# app initialization
|
118 |
if 'mode' not in st.session_state:
|
119 |
st.session_state.mode = tiger.RUN_MODES['all']
|
@@ -235,5 +299,4 @@ elif selected_model == 'Cas13d':
|
|
235 |
)
|
236 |
st.session_state.transcripts = None
|
237 |
st.experimental_rerun()
|
238 |
-
|
239 |
-
raise ValueError(f"Unknown model: {model_name}")
|
|
|
1 |
import os
|
2 |
import tiger
|
3 |
+
import cas9on
|
4 |
+
import cas9off
|
5 |
import pandas as pd
|
6 |
import streamlit as st
|
7 |
from pathlib import Path
|
8 |
|
9 |
# title and documentation
|
10 |
+
st.markdown(Path('crisprTool.md').read_text(), unsafe_allow_html=True)
|
11 |
st.divider()
|
12 |
|
13 |
CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']
|
|
|
15 |
selected_model = st.selectbox('Select CRISPR model:', CRISPR_MODELS, key='selected_model')
|
16 |
|
17 |
|
18 |
+
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes.

    The result is cached by Streamlit so the conversion is not recomputed on
    every script rerun.

    Args:
        df: The pandas DataFrame to serialize (the index is included).

    Returns:
        The CSV text encoded as UTF-8 ``bytes``.
    """
    return df.to_csv().encode('utf-8')
|
22 |
|
23 |
|
24 |
+
def mode_change_callback():
    """Synchronize the off-target checkbox state with the selected run mode.

    For the 'all' and 'titration' run modes the off-target check is switched
    off and its checkbox is disabled; for every other mode the checkbox is
    re-enabled (its current value is left untouched).
    """
    # TODO: support titration
    off_target_unavailable = st.session_state.mode in {
        tiger.RUN_MODES['all'],
        tiger.RUN_MODES['titration'],
    }
    if off_target_unavailable:
        st.session_state.check_off_targets = False
    st.session_state.disable_off_target_checkbox = off_target_unavailable
|
30 |
+
|
31 |
+
|
32 |
+
def progress_update(update_text, percent_complete):
    """Render a status message and a progress bar in the progress placeholder.

    Args:
        update_text: Message written above the progress bar.
        percent_complete: Completion on a 0-100 scale; it is divided by 100
            before being handed to ``st.progress``.
    """
    # `progress` is the module-level placeholder created with st.empty()
    with progress.container():
        st.write(update_text)
        st.progress(percent_complete / 100)
|
36 |
+
|
37 |
+
|
38 |
+
def initiate_run():
    """Validate the user's transcript input and stage it for a model run.

    Resets the result-related session state, builds a transcript DataFrame
    from either the manual text entry or the uploaded fasta file, normalizes
    the sequences (upper case, U -> T) and validates them. On success the
    DataFrame is stored in ``st.session_state.transcripts``; on failure a
    message is stored in ``st.session_state.input_error`` instead.
    """
    import tempfile  # local import: only needed to stage fasta uploads

    # initialize state variables
    st.session_state.transcripts = None
    st.session_state.input_error = None
    st.session_state.on_target = None
    st.session_state.titration = None
    st.session_state.off_target = None

    # initialize transcript DataFrame
    transcripts = pd.DataFrame(columns=[tiger.ID_COL, tiger.SEQ_COL])

    # manual entry
    if st.session_state.entry_method == ENTRY_METHODS['manual']:
        transcripts = pd.DataFrame({
            tiger.ID_COL: ['ManualEntry'],
            tiger.SEQ_COL: [st.session_state.manual_entry]
        }).set_index(tiger.ID_COL)

    # fasta file upload
    elif st.session_state.entry_method == ENTRY_METHODS['fasta']:
        upload = st.session_state.fasta_entry
        if upload is not None:
            # Stage the upload inside a private temporary directory instead of
            # the working directory: the file name is user-controlled, so only
            # its basename is used, and the directory (with the file) is
            # removed even if loading raises.
            with tempfile.TemporaryDirectory() as staging_dir:
                safe_name = os.path.basename(upload.name) or 'upload.fasta'
                fasta_path = os.path.join(staging_dir, safe_name)
                with open(fasta_path, 'w') as f:
                    f.write(upload.getvalue().decode('utf-8'))
                transcripts = tiger.load_transcripts([fasta_path], enforce_unique_ids=False)

    # convert to upper case as used by tokenizer
    transcripts[tiger.SEQ_COL] = transcripts[tiger.SEQ_COL].apply(lambda s: s.upper().replace('U', 'T'))

    # ensure all transcripts have unique identifiers
    if transcripts.index.has_duplicates:
        st.session_state.input_error = "Duplicate transcript ID's detected in fasta file"

    # ensure all transcripts only contain nucleotides A, C, G, T, and wildcard N
    elif not all(transcripts[tiger.SEQ_COL].apply(lambda s: set(s).issubset(tiger.NUCLEOTIDE_TOKENS.keys()))):
        st.session_state.input_error = 'Transcript(s) must only contain upper or lower case A, C, G, and Ts or Us'

    # ensure all transcripts satisfy length requirements
    elif any(transcripts[tiger.SEQ_COL].apply(lambda s: len(s) < tiger.TARGET_LEN)):
        st.session_state.input_error = 'Transcript(s) must be at least {:d} bases.'.format(tiger.TARGET_LEN)

    # run model if we have any transcripts
    elif len(transcripts) > 0:
        st.session_state.transcripts = transcripts
|
83 |
|
84 |
+
# Check if the selected model is Cas9
|
85 |
+
if selected_model == 'Cas9':
|
86 |
+
# Use a radio button to select enzymes, making sure only one can be selected at a time
|
87 |
+
target_selection = st.radio(
|
88 |
+
"Select either on-target or off-target:",
|
89 |
+
('on-target', 'off-target'),
|
90 |
+
key='target_selection'
|
91 |
+
)
|
92 |
|
93 |
+
# Actions based on the selected enzyme
|
94 |
+
if target_selection == 'on-target':
|
|
|
|
|
|
|
|
|
95 |
|
96 |
+
pass
|
97 |
+
elif target_selection == 'off-target':
|
98 |
+
ENTRY_METHODS = dict(
|
99 |
+
manual='Manual entry of target sequence',
|
100 |
+
txt="txt file upload"
|
101 |
+
)
|
102 |
+
if __name__ == '__main__':
|
103 |
+
# app initialization for Cas9 off-target
|
104 |
+
if 'target_sequence' not in st.session_state:
|
105 |
+
st.session_state.target_sequence = None
|
106 |
+
if 'input_error' not in st.session_state:
|
107 |
+
st.session_state.input_error = None
|
108 |
+
if 'off_target_results' not in st.session_state:
|
109 |
+
st.session_state.off_target_results = None
|
110 |
|
111 |
+
# target sequence entry
|
112 |
+
st.selectbox(
|
113 |
+
label='How would you like to provide target sequences?',
|
114 |
+
options=ENTRY_METHODS.values(),
|
115 |
+
key='entry_method',
|
116 |
+
disabled=st.session_state.target_sequence is not None
|
117 |
+
)
|
118 |
if st.session_state.entry_method == ENTRY_METHODS['manual']:
|
119 |
+
st.text_input(
|
120 |
+
label='Enter on/off sequences:',
|
121 |
+
key='manual_entry',
|
122 |
+
placeholder='Enter on/off sequences like:GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG',
|
123 |
+
disabled=st.session_state.target_sequence is not None
|
124 |
+
)
|
125 |
+
elif st.session_state.entry_method == ENTRY_METHODS['txt']:
|
126 |
+
st.file_uploader(
|
127 |
+
label='Upload a txt file:',
|
128 |
+
key='txt_entry',
|
129 |
+
disabled=st.session_state.target_sequence is not None
|
130 |
+
)
|
131 |
|
132 |
+
# prediction button
|
133 |
+
st.button(label='Predict off-target effects', on_click=cas9off.CRISPR_net_predict,
|
134 |
+
disabled=st.session_state.target_sequence is not None)
|
135 |
+
progress = st.empty()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
+
# input error display
|
138 |
+
error = st.empty()
|
139 |
+
if st.session_state.input_error is not None:
|
140 |
+
error.error(st.session_state.input_error, icon="🚨")
|
141 |
+
else:
|
142 |
+
error.empty()
|
143 |
|
144 |
+
# off-target results display
|
145 |
+
off_target_results = st.empty()
|
146 |
+
if st.session_state.off_target_results is not None:
|
147 |
+
with off_target_results.container():
|
148 |
+
if len(st.session_state.off_target_results) > 0:
|
149 |
+
st.write('Off-target predictions:', st.session_state.off_target_results)
|
150 |
+
st.download_button(
|
151 |
+
label='Download off-target predictions',
|
152 |
+
data=convert_df(st.session_state.off_target_results),
|
153 |
+
file_name='off_target_results.csv',
|
154 |
+
mime='text/csv'
|
155 |
+
)
|
156 |
+
else:
|
157 |
+
st.write('No significant off-target effects detected!')
|
158 |
+
else:
|
159 |
+
off_target_results.empty()
|
160 |
|
161 |
+
# running the CRISPR-Net model for off-target predictions
|
162 |
+
if st.session_state.target_sequence is not None:
|
163 |
+
st.session_state.off_target_results = cas9off.predict_off_targets(
|
164 |
+
target_sequence=st.session_state.target_sequence,
|
165 |
+
status_update_fn=progress_update
|
166 |
+
)
|
167 |
+
st.session_state.target_sequence = None
|
168 |
+
st.experimental_rerun()
|
169 |
|
170 |
+
elif selected_model == 'Cas12':
|
171 |
+
# Placeholder for Cas12 model loading
|
172 |
+
# TODO: Implement Cas12 model loading logic
|
173 |
+
raise NotImplementedError("Cas12 model loading not implemented yet.")
|
174 |
+
elif selected_model == 'Cas13d':
|
175 |
+
ENTRY_METHODS = dict(
|
176 |
+
manual='Manual entry of single transcript',
|
177 |
+
fasta="Fasta file upload (supports multiple transcripts if they have unique ID's)"
|
178 |
+
)
|
179 |
|
180 |
if __name__ == '__main__':
|
|
|
181 |
# app initialization
|
182 |
if 'mode' not in st.session_state:
|
183 |
st.session_state.mode = tiger.RUN_MODES['all']
|
|
|
299 |
)
|
300 |
st.session_state.transcripts = None
|
301 |
st.experimental_rerun()
|
302 |
+
|
|
cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "main_input", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 1, 24, 7], "dtype": "float32", "sparse": false, "name": "main_input"}, "inbound_nodes": []}, {"name": "conv2d_1", "class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "filters": 10, "kernel_size": [1, 1], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_2", "class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 10, "kernel_size": [1, 2], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_3", "class_name": "Conv2D", "config": {"name": "conv2d_3", "trainable": true, "filters": 10, "kernel_size": [1, 3], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": 
{"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_4", "class_name": "Conv2D", "config": {"name": "conv2d_4", "trainable": true, "filters": 10, "kernel_size": [1, 5], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "activation_1", "class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_1", 0, 0, {}]]]}, {"name": "activation_2", "class_name": "Activation", "config": {"name": "activation_2", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_2", 0, 0, {}]]]}, {"name": "activation_3", "class_name": "Activation", "config": {"name": "activation_3", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_3", 0, 0, {}]]]}, {"name": "activation_4", "class_name": "Activation", "config": {"name": "activation_4", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_4", 0, 0, {}]]]}, {"name": "concatenate_1", "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "axis": -1}, "inbound_nodes": [[["main_input", 0, 0, {}], ["activation_1", 0, 0, {}], ["activation_2", 0, 0, {}], ["activation_3", 0, 0, {}], ["activation_4", 0, 0, {}]]]}, {"name": "reshape_1", "class_name": "Reshape", "config": {"name": "reshape_1", "trainable": true, "target_shape": [24, 47]}, 
"inbound_nodes": [[["concatenate_1", 0, 0, {}]]]}, {"name": "bidirectional_1", "class_name": "Bidirectional", "config": {"name": "bidirectional_1", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "LSTM_out", "trainable": true, "batch_input_shape": [null, 24, 47], "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 15, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, "merge_mode": "concat"}, "inbound_nodes": [[["reshape_1", 0, 0, {}]]]}, {"name": "flatten_1", "class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "data_format": "channels_last"}, "inbound_nodes": [[["bidirectional_1", 0, 0, {}]]]}, {"name": "dense_1", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 80, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["flatten_1", 0, 0, {}]]]}, {"name": "dense_2", "class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 20, "activation": 
"relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dense_1", 0, 0, {}]]]}, {"name": "dropout_1", "class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.35, "noise_shape": null, "seed": null}, "inbound_nodes": [[["dense_2", 0, 0, {}]]]}, {"name": "main_output", "class_name": "Dense", "config": {"name": "main_output", "trainable": true, "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dropout_1", 0, 0, {}]]]}], "input_layers": [["main_input", 0, 0]], "output_layers": [["main_output", 0, 0]]}, "keras_version": "2.2.4", "backend": "tensorflow"}
|
cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7f6aa381520f5c68fa1f099a6ef3ebc3b8ce846709b97dfde2053f26ca62f80
|
3 |
+
size 312432
|
cas9_model/on-cla.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5acf8f740cf326052ad08db2ca71d7204526c61f6a9fcdca36e15004bc16ad04
|
3 |
+
size 34044032
|
cas9off.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tensorflow as tf
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import os
|
5 |
+
import argparse
|
6 |
+
|
7 |
+
# column names
ID_COL = 'Transcript ID'
SEQ_COL = 'Transcript Sequence'

# configure GPUs: enable memory growth on every GPU, then expose only the first one
_gpus = tf.config.list_physical_devices('GPU')  # query the device list once
for gpu in _gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if _gpus:
    tf.config.experimental.set_visible_devices(_gpus[0], 'GPU')

# application configuration
BATCH_SIZE_COMPUTE = 500
BATCH_SIZE_SCAN = 20
BATCH_SIZE_TRANSCRIPTS = 50
NUM_TOP_GUIDES = 10
NUM_MISMATCHES = 3
RUN_MODES = dict(
    all='All on-target guides per transcript',
    top_guides='Top {:d} guides per transcript'.format(NUM_TOP_GUIDES),
    titration='Top {:d} guides per transcript & their titration candidates'.format(NUM_TOP_GUIDES)
)
|
28 |
+
|
29 |
+
class Encoder:
    """Encode an (on-target, off-target) sequence pair as a per-position matrix.

    Both sequences are left-padded with "-" to 24 characters. Each position is
    encoded as 7 values:

    * values 0-4: element-wise OR of the one-hot codes (A, T, G, C, "_") of the
      on-target and off-target bases; the "-" padding encodes as all zeros;
    * values 5-6: a direction flag for mismatches -- ``[1, 0]`` when the
      on-target base ranks higher than the off-target base in
      ``direction_dict``, ``[0, 1]`` when it ranks lower, ``[0, 0]`` when the
      bases match or either one is padding.

    An "N" in the on-target sequence is treated as the off-target base at the
    same position. The result is stored in ``on_off_code``.

    Args:
        on_seq: On-target (sgRNA) sequence using A/C/G/T, "_" for a gap, or N.
        off_seq: Off-target sequence using A/C/G/T or "_" for a gap.
        with_category: When true, ``label`` is stored as ``self.label``.
        label: Optional class label, kept only if ``with_category`` is true.
        with_reg_val: When true, ``value`` is stored as ``self.value``.
        value: Optional regression value, kept only if ``with_reg_val`` is true.

    Raises:
        KeyError: If a sequence contains a character with no encoding.
    """

    def __init__(self, on_seq, off_seq, with_category=False, label=None, with_reg_val=False, value=None):
        tlen = 24
        # left-pad with "-" up to tlen (longer sequences are left unchanged)
        self.on_seq = on_seq.rjust(tlen, "-")
        self.off_seq = off_seq.rjust(tlen, "-")
        self.encoded_dict_indel = {'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0],
                                   'G': [0, 0, 1, 0, 0], 'C': [0, 0, 0, 1, 0],
                                   '_': [0, 0, 0, 0, 1], '-': [0, 0, 0, 0, 0]}
        self.direction_dict = {'A': 5, 'G': 4, 'C': 3, 'T': 2, '_': 1}
        if with_category:
            self.label = label
        if with_reg_val:
            self.value = value
        self.encode_on_off_dim7()

    def encode_sgRNA(self):
        """One-hot encode the on-target sequence into ``self.sgRNA_code``."""
        table = self.encoded_dict_indel
        codes = []
        for i, base in enumerate(self.on_seq):
            if base == "N":
                # wildcard: adopt the off-target base at this position
                base = self.off_seq[i]
            codes.append(table[base])
        self.sgRNA_code = np.array(codes)

    def encode_off(self):
        """One-hot encode the off-target sequence into ``self.off_code``."""
        table = self.encoded_dict_indel
        self.off_code = np.array([table[base] for base in self.off_seq])

    def encode_on_off_dim7(self):
        """Build the 7-value-per-position pair encoding in ``self.on_off_code``."""
        self.encode_sgRNA()
        self.encode_off()
        rank = self.direction_dict
        pair_codes = []
        for i, on_b in enumerate(self.on_seq):
            off_b = self.off_seq[i]
            diff_code = np.bitwise_or(self.sgRNA_code[i], self.off_code[i])
            if on_b == "N":
                on_b = off_b
            dir_code = np.zeros(2)
            # padding and matching bases carry no direction information
            if on_b != "-" and off_b != "-" and rank[on_b] != rank[off_b]:
                if rank[on_b] > rank[off_b]:
                    dir_code[0] = 1
                else:
                    dir_code[1] = 1
            pair_codes.append(np.concatenate((diff_code, dir_code)))
        self.on_off_code = np.array(pair_codes)
|
83 |
+
|
84 |
+
def encode_on_off_seq_pairs(input_file, output_file="CRISPR_net_results.csv"):
    """Score every (on-target, off-target) pair in a CSV file with CRISPR-Net.

    The input is a header-less, comma-separated file with one
    ``on_seq,off_seq`` pair per line. Each pair is encoded with ``Encoder``,
    scored by ``CRISPR_net_predict`` and the scores are appended as the
    ``CRISPR_Net_score`` column.

    Args:
        input_file: Path (or file-like object) of the pair listing.
        output_file: Where the scored table is written as CSV.

    Returns:
        The scored pandas DataFrame (columns ``on_seq``, ``off_seq``,
        ``CRISPR_Net_score``).
    """
    inputs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    input_codes = np.array([
        Encoder(on_seq=on_seq, off_seq=off_seq).on_off_code
        for on_seq, off_seq in zip(inputs['on_seq'], inputs['off_seq'])
    ])
    # model input layout: (pairs, 1, 24 positions, 7 values per position)
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))
    if len(input_codes) > 0:
        y_pred = CRISPR_net_predict(input_codes)
    else:
        # nothing to score: skip loading and running the model
        y_pred = np.empty(0)
    inputs['CRISPR_Net_score'] = y_pred
    inputs.to_csv(output_file, index=False)
    return inputs
|
97 |
+
|
98 |
+
def CRISPR_net_predict(X_test):
    """Run the CRISPR-Net off-target model on encoded sequence pairs.

    The architecture is rebuilt from the JSON description and the weights are
    loaded from the HDF5 file on every call; both paths are relative to the
    current working directory.

    Args:
        X_test: Encoded input batch passed unchanged to ``model.predict``
            (callers in this module provide shape ``(n, 1, 24, 7)``).

    Returns:
        The model predictions flattened to a 1-D array.
    """
    # context manager guarantees the handle is closed even if read() raises
    with open("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)  # Updated for TensorFlow 2
    loaded_model.load_weights("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5")
    y_pred = loaded_model.predict(X_test).flatten()
    return y_pred
|
106 |
+
|
107 |
+
if __name__ == '__main__':
    # Command-line entry point: score the pairs listed in the given file.
    parser = argparse.ArgumentParser(description="CRISPR-Net v1.0 (Aug 10 2019)")
    parser.add_argument("input_file",
                        help="input_file example (on-target seq, off-target seq):\n GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT\n"
                             "GTTGCCCCACAGGGCAGTAAAGG,GTGGACACCCCGGGCAGGAAAGG\n"
                             "GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG")
    args = parser.parse_args()
    input_path = args.input_file
    if not os.path.exists(input_path):
        print("File doesn't exist!")
    else:
        encode_on_off_seq_pairs(input_path)
    # release the Keras/TensorFlow global state before exiting
    tf.keras.backend.clear_session()
|
cas9on.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tensorflow as tf
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from operator import add
|
5 |
+
from functools import reduce
|
6 |
+
|
7 |
+
# configure GPUs: enable memory growth on every GPU, then expose only the first one
_gpus = tf.config.list_physical_devices('GPU')  # query the device list once
for gpu in _gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if _gpus:
    tf.config.experimental.set_visible_devices(_gpus[0], 'GPU')
|
12 |
+
|
13 |
+
|
14 |
+
# one-hot code of each nucleotide, channel order A, C, G, T
ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }
# single-channel code of each epigenetic-track character
epimap = {'A': 1, 'N': 0}


def get_seqcode(seq):
    """One-hot encode a nucleotide sequence.

    Args:
        seq: Sequence of A/C/G/T characters (case-insensitive).

    Returns:
        Integer array of shape ``(1, len(seq), 4)``.

    Raises:
        KeyError: If ``seq`` contains a character other than A, C, G or T.
    """
    codes = [ntmap[c] for c in seq.upper()]
    # explicit last dimension keeps the reshape valid for an empty sequence
    return np.array(codes, dtype=int).reshape((1, len(seq), len(ntmap['A'])))


def get_epicode(eseq):
    """Encode an epigenetic-track string ('A' -> 1, 'N' -> 0).

    Args:
        eseq: String made of the characters 'A' and 'N' (case-sensitive).

    Returns:
        Integer array of shape ``(1, len(eseq), 1)``.

    Raises:
        KeyError: If ``eseq`` contains a character other than 'A' or 'N'.
    """
    codes = [epimap[c] for c in eseq]
    return np.array(codes, dtype=int).reshape((1, len(eseq), 1))
|
29 |
+
|
30 |
+
class Episgt:
    """Tab-separated on-target dataset: a sequence, epigenetic tracks, optional label.

    Only the trailing columns of the file are used, in this order: the
    sequence column, ``num_epi_features`` epigenetic-track columns and, when
    ``with_y`` is true, a final label column.

    Args:
        fpath: Path (or file-like object) of the header-less, tab-separated file.
        num_epi_features: Number of epigenetic-track columns after the sequence.
        with_y: Whether the last column holds the target value.

    Raises:
        ValueError: If the file has fewer columns than the layout requires.
    """

    def __init__(self, fpath, num_epi_features, with_y=True):
        self._fpath = fpath
        self._ori_df = pd.read_csv(fpath, sep='\t', index_col=None, header=None)
        self._num_epi_features = num_epi_features
        self._with_y = with_y
        self._num_cols = num_epi_features + 2 if with_y else num_epi_features + 1
        available = self._ori_df.shape[1]
        if available < self._num_cols:
            # a negative slice would otherwise silently select the wrong columns
            raise ValueError(
                f"expected at least {self._num_cols} columns, found {available}"
            )
        self._cols = list(self._ori_df.columns)[-self._num_cols:]
        self._df = self._ori_df[self._cols]

    @property
    def length(self):
        """Number of rows (samples) in the dataset."""
        return len(self._df)

    def __len__(self):
        return len(self._df)

    def get_dataset(self, x_dtype=np.float32, y_dtype=np.float32):
        """Encode the dataset into model-ready arrays.

        Args:
            x_dtype: dtype of the returned feature array.
            y_dtype: dtype of the returned label array.

        Returns:
            ``x`` of shape ``(rows, 4 + num_epi_features, sequence length)``,
            or the tuple ``(x, y)`` when the dataset was created with
            ``with_y=True``.
        """
        seq_col = self._cols[0]
        epi_cols = self._cols[1: 1 + self._num_epi_features]
        # sequence channels first, then one channel per epigenetic track
        channels = [np.concatenate([get_seqcode(seq) for seq in self._df[seq_col]])]
        channels.extend(
            np.concatenate([get_epicode(eseq) for eseq in self._df[col]])
            for col in epi_cols
        )
        x = np.concatenate(channels, axis=-1).astype(x_dtype)
        # (rows, positions, channels) -> (rows, channels, positions)
        x = x.transpose(0, 2, 1)
        if self._with_y:
            y = np.array(self._df[self._cols[-1]]).astype(y_dtype)
            return x, y
        return x
|
55 |
+
|
56 |
+
from keras.models import load_model
|
57 |
+
|
58 |
+
class DCModelOntar:
    """Thin wrapper around a saved Keras on-target model.

    Args:
        ontar_model_dir: Path of the saved model handed to ``load_model``.
        is_reg: Kept for backward compatibility; the model is loaded the same
            way whether or not it is a regression model.
    """

    def __init__(self, ontar_model_dir, is_reg=False):
        self.model = load_model(ontar_model_dir)

    def ontar_predict(self, x, channel_first=True):
        """Predict on-target scores for a batch.

        Args:
            x: 4-D input array.
            channel_first: When true, axis 1 of ``x`` is moved to the last
                position (axes reordered to ``[0, 2, 3, 1]``) before prediction.

        Returns:
            The model output flattened to a 1-D array.
        """
        if channel_first:
            x = x.transpose([0, 2, 3, 1])
        yp = self.model.predict(x)
        return yp.ravel()
|
70 |
+
|
71 |
+
def predict(file_path='eg_cls_on_target.episgt', model_path='cas9_model/on-cla.h5'):
    """Predict on-target scores for an ``.episgt`` dataset.

    Args:
        file_path: Tab-separated dataset with a sequence column, four
            epigenetic-track columns and a label column.
        model_path: Saved Keras model; defaults to the location of
            ``on-cla.h5`` inside the ``cas9_model`` directory.

    Returns:
        1-D array of predicted on-target scores, one per dataset row.
    """
    input_data = Episgt(file_path, num_epi_features=4, with_y=True)
    x, _ = input_data.get_dataset()  # labels are not needed for prediction
    x = np.expand_dims(x, axis=2)  # shape(x) = [100, 8, 1, 23]
    dcModel = DCModelOntar(model_path)
    predicted_on_target = dcModel.ontar_predict(x)
    return predicted_on_target
|
tiger.md → crisprTool.md
RENAMED
File without changes
|