Spaces:
Sleeping
Sleeping
supercat666
commited on
Commit
·
ad9ec7b
1
Parent(s):
4fa4501
fix
Browse files
app.py
CHANGED
@@ -148,7 +148,7 @@ if selected_model == 'Cas9':
|
|
148 |
# Process predictions
|
149 |
if predict_button and gene_symbol:
|
150 |
with st.spinner('Predicting... Please wait'):
|
151 |
-
predictions, gene_sequence, exons
|
152 |
sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
|
153 |
st.session_state['on_target_results'] = sorted_predictions
|
154 |
st.session_state['gene_sequence'] = gene_sequence # Save gene sequence in session state
|
|
|
148 |
# Process predictions
|
149 |
if predict_button and gene_symbol:
|
150 |
with st.spinner('Predicting... Please wait'):
|
151 |
+
predictions, gene_sequence, exons = cas9on.process_gene(gene_symbol, cas9on_path)
|
152 |
sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
|
153 |
st.session_state['on_target_results'] = sorted_predictions
|
154 |
st.session_state['gene_sequence'] = gene_sequence # Save gene sequence in session state
|
cas12.py
CHANGED
@@ -87,113 +87,126 @@ def fetch_ensembl_sequence(transcript_id):
|
|
87 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
88 |
return None
|
89 |
|
90 |
-
|
91 |
-
def find_crispr_targets(sequence, chr, start, strand, pam="TTTN", target_length=34):
|
92 |
targets = []
|
93 |
len_sequence = len(sequence)
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
for i in range(len_sequence - target_length + 1):
|
96 |
target_seq = sequence[i:i + target_length]
|
97 |
if target_seq[4:7] == 'TTT':
|
98 |
tar_start = start + i
|
99 |
tar_end = start + i + target_length
|
100 |
-
gRNA = target_seq[8:28]
|
101 |
-
targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
|
102 |
return targets
|
103 |
|
104 |
-
def format_prediction_output(targets,
|
|
|
|
|
|
|
|
|
105 |
formatted_data = []
|
106 |
for target in targets:
|
107 |
# Predict
|
108 |
-
encoded_seq = get_seqcode(target[0])
|
109 |
-
prediction =
|
|
|
|
|
|
|
110 |
# Format output
|
111 |
-
gRNA = target[1]
|
112 |
-
chr = target[2]
|
113 |
-
start = target[3]
|
114 |
-
end = target[4]
|
115 |
-
strand = target[5]
|
116 |
-
|
117 |
-
|
|
|
|
|
118 |
return formatted_data
|
119 |
|
120 |
def process_gene(gene_symbol, model_path):
|
121 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
122 |
-
|
123 |
-
gene_sequence = '' # Initialize an empty string for the gene sequence
|
124 |
-
|
125 |
-
# Load the model
|
126 |
-
seq_deepCpf1 = Seq_DeepCpf1_model(input_shape=(34, 4))
|
127 |
-
seq_deepCpf1.load_weights(model_path)
|
128 |
-
|
129 |
if transcripts:
|
130 |
-
for
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
else:
|
144 |
print("Failed to retrieve transcripts.")
|
145 |
|
146 |
-
return
|
147 |
-
|
148 |
-
|
149 |
-
def create_genbank_features(formatted_data):
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
def generate_genbank_file_from_data(formatted_data, gene_sequence, gene_symbol, output_path):
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
def create_csv_from_df(df, output_path):
|
179 |
-
|
180 |
-
|
181 |
-
def generate_bed_file_from_data(formatted_data, output_path):
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
87 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
88 |
return None
|
89 |
|
90 |
+
def find_crispr_targets(sequence, chr, start, strand, transcript_id, exon_id, pam="TTTN", target_length=34):
|
|
|
91 |
targets = []
|
92 |
len_sequence = len(sequence)
|
93 |
+
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
|
94 |
+
dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
|
95 |
+
|
96 |
+
if strand == -1:
|
97 |
+
sequence = ''.join([complement[base] for base in sequence])
|
98 |
|
99 |
for i in range(len_sequence - target_length + 1):
|
100 |
target_seq = sequence[i:i + target_length]
|
101 |
if target_seq[4:7] == 'TTT':
|
102 |
tar_start = start + i
|
103 |
tar_end = start + i + target_length
|
104 |
+
gRNA = ''.join([dnatorna[base] for base in target_seq[8:28]])
|
105 |
+
targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
|
106 |
return targets
|
107 |
|
108 |
+
def format_prediction_output(targets, model_path):
|
109 |
+
# Loading weights for the model
|
110 |
+
Seq_deepCpf1 = Seq_DeepCpf1_model(input_shape=(34, 4))
|
111 |
+
Seq_deepCpf1.load_weights(model_path)
|
112 |
+
|
113 |
formatted_data = []
|
114 |
for target in targets:
|
115 |
# Predict
|
116 |
+
encoded_seq = get_seqcode(target[0])
|
117 |
+
prediction = float(list(Seq_deepCpf1.predict(encoded_seq)[0])[0])
|
118 |
+
if prediction > 100:
|
119 |
+
prediction = 100
|
120 |
+
|
121 |
# Format output
|
122 |
+
gRNA = target[1]
|
123 |
+
chr = target[2]
|
124 |
+
start = target[3]
|
125 |
+
end = target[4]
|
126 |
+
strand = target[5]
|
127 |
+
transcript_id = target[6]
|
128 |
+
exon_id = target[7]
|
129 |
+
formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, prediction])
|
130 |
+
|
131 |
return formatted_data
|
132 |
|
133 |
def process_gene(gene_symbol, model_path):
|
134 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
135 |
+
results = []
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
if transcripts:
|
137 |
+
for i in range(len(transcripts)):
|
138 |
+
Exons = transcripts[i]['Exon']
|
139 |
+
transcript_id = transcripts[i]['id']
|
140 |
+
for j in range(len(Exons)):
|
141 |
+
exon_id = Exons[j]['id']
|
142 |
+
gene_sequence = fetch_ensembl_sequence(exon_id)
|
143 |
+
if gene_sequence:
|
144 |
+
start = Exons[j]['start']
|
145 |
+
strand = Exons[j]['strand']
|
146 |
+
chr = Exons[j]['seq_region_name']
|
147 |
+
targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
|
148 |
+
if targets:
|
149 |
+
formatted_data = format_prediction_output(targets,
|
150 |
+
'/content/drive/MyDrive/Colab Notebooks/DeepCpf1/Seq_deepCpf1_weights.h5')
|
151 |
+
results.append(formatted_data)
|
152 |
+
# for data in formatted_data:
|
153 |
+
# print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, target: {data[4]}, gRNA: {data[5]}, pred_Score: {data[6]}")
|
154 |
+
else:
|
155 |
+
print("Failed to retrieve gene sequence.")
|
156 |
else:
|
157 |
print("Failed to retrieve transcripts.")
|
158 |
|
159 |
+
return results, gene_sequence, Exons
|
160 |
+
|
161 |
+
|
162 |
+
# def create_genbank_features(formatted_data):
|
163 |
+
# features = []
|
164 |
+
# for data in formatted_data:
|
165 |
+
# try:
|
166 |
+
# # Attempt to convert start and end positions to integers
|
167 |
+
# start = int(data[1])
|
168 |
+
# end = int(data[2])
|
169 |
+
# except ValueError as e:
|
170 |
+
# # Log the error and skip this iteration if conversion fails
|
171 |
+
# print(f"Error converting start/end to int: {data[1]}, {data[2]} - {e}")
|
172 |
+
# continue # Skip this iteration
|
173 |
+
#
|
174 |
+
# # Proceed as normal if conversion is successful
|
175 |
+
# strand = 1 if data[3] == '+' else -1
|
176 |
+
# location = FeatureLocation(start=start, end=end, strand=strand)
|
177 |
+
# feature = SeqFeature(location=location, type="misc_feature", qualifiers={
|
178 |
+
# 'label': data[5], # gRNA as label
|
179 |
+
# 'note': f"Prediction: {data[6]}" # Prediction score in note
|
180 |
+
# })
|
181 |
+
# features.append(feature)
|
182 |
+
# return features
|
183 |
+
#
|
184 |
+
# def generate_genbank_file_from_data(formatted_data, gene_sequence, gene_symbol, output_path):
|
185 |
+
# features = create_genbank_features(formatted_data)
|
186 |
+
# record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
|
187 |
+
# description='CRISPR Cas12 predicted targets', features=features)
|
188 |
+
# record.annotations["molecule_type"] = "DNA"
|
189 |
+
# SeqIO.write(record, output_path, "genbank")
|
190 |
+
#
|
191 |
+
# def create_csv_from_df(df, output_path):
|
192 |
+
# df.to_csv(output_path, index=False)
|
193 |
+
#
|
194 |
+
# def generate_bed_file_from_data(formatted_data, output_path):
|
195 |
+
# with open(output_path, 'w') as bed_file:
|
196 |
+
# for data in formatted_data:
|
197 |
+
# try:
|
198 |
+
# # Ensure data has the expected number of elements
|
199 |
+
# if len(data) < 7:
|
200 |
+
# raise ValueError("Incomplete data item")
|
201 |
+
#
|
202 |
+
# chrom = data[0]
|
203 |
+
# start = data[1]
|
204 |
+
# end = data[2]
|
205 |
+
# strand = '+' if data[3] == '+' else '-'
|
206 |
+
# gRNA = data[5]
|
207 |
+
# score = data[6] # Ensure this index exists
|
208 |
+
#
|
209 |
+
# bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")
|
210 |
+
# except ValueError as e:
|
211 |
+
# print(f"Skipping an item due to error: {e}")
|
212 |
+
# continue
|
cas9on.py
CHANGED
@@ -115,31 +115,35 @@ def format_prediction_output(targets, model_path):
|
|
115 |
def process_gene(gene_symbol, model_path):
|
116 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
117 |
results = []
|
|
|
|
|
|
|
118 |
if transcripts:
|
119 |
-
for
|
120 |
-
Exons =
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
124 |
gene_sequence = fetch_ensembl_sequence(exon_id)
|
125 |
if gene_sequence:
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
129 |
targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
|
130 |
-
if
|
131 |
-
print("No gRNA sites found in the gene sequence.")
|
132 |
-
else:
|
133 |
# Predict on-target efficiency for each gRNA site
|
134 |
-
formatted_data = format_prediction_output(targets,model_path)
|
135 |
-
results.
|
136 |
-
# for data in formatted_data:
|
137 |
-
# print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, gRNA: {data[4]}, pred_Score: {data[5]}")
|
138 |
else:
|
139 |
-
print("Failed to retrieve gene sequence.")
|
140 |
else:
|
141 |
print("Failed to retrieve transcripts.")
|
142 |
-
|
|
|
|
|
143 |
|
144 |
|
145 |
# def create_genbank_features(formatted_data):
|
|
|
115 |
def process_gene(gene_symbol, model_path):
|
116 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
117 |
results = []
|
118 |
+
all_exons = [] # To accumulate all exons
|
119 |
+
all_gene_sequences = [] # To accumulate all gene sequences
|
120 |
+
|
121 |
if transcripts:
|
122 |
+
for transcript in transcripts:
|
123 |
+
Exons = transcript['Exon']
|
124 |
+
all_exons.extend(Exons) # Add all exons from this transcript to the list
|
125 |
+
transcript_id = transcript['id']
|
126 |
+
|
127 |
+
for exon in Exons:
|
128 |
+
exon_id = exon['id']
|
129 |
gene_sequence = fetch_ensembl_sequence(exon_id)
|
130 |
if gene_sequence:
|
131 |
+
all_gene_sequences.append(gene_sequence) # Add this gene sequence to the list
|
132 |
+
start = exon['start']
|
133 |
+
strand = exon['strand']
|
134 |
+
chr = exon['seq_region_name']
|
135 |
targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
|
136 |
+
if targets:
|
|
|
|
|
137 |
# Predict on-target efficiency for each gRNA site
|
138 |
+
formatted_data = format_prediction_output(targets, model_path)
|
139 |
+
results.extend(formatted_data)
|
|
|
|
|
140 |
else:
|
141 |
+
print(f"Failed to retrieve gene sequence for exon {exon_id}.")
|
142 |
else:
|
143 |
print("Failed to retrieve transcripts.")
|
144 |
+
|
145 |
+
# Return the sorted output, combined gene sequences, and all exons
|
146 |
+
return results, all_gene_sequences, all_exons
|
147 |
|
148 |
|
149 |
# def create_genbank_features(formatted_data):
|