supercat666 commited on
Commit
ad9ec7b
·
1 Parent(s): 4fa4501
Files changed (3) hide show
  1. app.py +1 -1
  2. cas12.py +101 -88
  3. cas9on.py +21 -17
app.py CHANGED
@@ -148,7 +148,7 @@ if selected_model == 'Cas9':
148
  # Process predictions
149
  if predict_button and gene_symbol:
150
  with st.spinner('Predicting... Please wait'):
151
- predictions, gene_sequence, exons = cas9on.process_gene(gene_symbol, cas9on_path)
152
  sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
153
  st.session_state['on_target_results'] = sorted_predictions
154
  st.session_state['gene_sequence'] = gene_sequence # Save gene sequence in session state
 
148
  # Process predictions
149
  if predict_button and gene_symbol:
150
  with st.spinner('Predicting... Please wait'):
151
+ predictions, gene_sequence, exons = cas9on.process_gene(gene_symbol, cas9on_path)
152
  sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
153
  st.session_state['on_target_results'] = sorted_predictions
154
  st.session_state['gene_sequence'] = gene_sequence # Save gene sequence in session state
cas12.py CHANGED
@@ -87,113 +87,126 @@ def fetch_ensembl_sequence(transcript_id):
87
  print(f"Error fetching sequence data from Ensembl: {response.text}")
88
  return None
89
 
90
-
91
- def find_crispr_targets(sequence, chr, start, strand, pam="TTTN", target_length=34):
92
  targets = []
93
  len_sequence = len(sequence)
 
 
 
 
 
94
 
95
  for i in range(len_sequence - target_length + 1):
96
  target_seq = sequence[i:i + target_length]
97
  if target_seq[4:7] == 'TTT':
98
  tar_start = start + i
99
  tar_end = start + i + target_length
100
- gRNA = target_seq[8:28]
101
- targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
102
  return targets
103
 
104
- def format_prediction_output(targets, seq_deepCpf1):
 
 
 
 
105
  formatted_data = []
106
  for target in targets:
107
  # Predict
108
- encoded_seq = get_seqcode(target[0]) # 'target' seems to be the full sequence including PAM
109
- prediction = seq_deepCpf1.predict(encoded_seq)
 
 
 
110
  # Format output
111
- gRNA = target[1] # gRNA is presumably the guide RNA sequence
112
- chr = target[2] # Chromosome
113
- start = target[3] # Start position
114
- end = target[4] # End position
115
- strand = target[5] # Strand
116
- target_seq = target[0] # Full target sequence including PAM
117
- formatted_data.append([chr, start, end, strand, target_seq, gRNA, prediction[0][0]])
 
 
118
  return formatted_data
119
 
120
  def process_gene(gene_symbol, model_path):
121
  transcripts = fetch_ensembl_transcripts(gene_symbol)
122
- all_data = []
123
- gene_sequence = '' # Initialize an empty string for the gene sequence
124
-
125
- # Load the model
126
- seq_deepCpf1 = Seq_DeepCpf1_model(input_shape=(34, 4))
127
- seq_deepCpf1.load_weights(model_path)
128
-
129
  if transcripts:
130
- for transcript in transcripts:
131
- transcript_id = transcript['id']
132
- chr = transcript.get('seq_region_name', 'unknown')
133
- start = transcript.get('start', 0)
134
- strand = transcript.get('strand', 'unknown')
135
- # Fetch the sequence here and concatenate if multiple transcripts
136
- gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
137
-
138
- if gene_sequence:
139
- targets = find_crispr_targets(gene_sequence, chr, start, strand)
140
- if targets:
141
- formatted_data = format_prediction_output(targets, seq_deepCpf1)
142
- all_data.extend(formatted_data)
 
 
 
 
 
 
143
  else:
144
  print("Failed to retrieve transcripts.")
145
 
146
- return all_data, gene_sequence
147
-
148
-
149
- def create_genbank_features(formatted_data):
150
- features = []
151
- for data in formatted_data:
152
- try:
153
- # Attempt to convert start and end positions to integers
154
- start = int(data[1])
155
- end = int(data[2])
156
- except ValueError as e:
157
- # Log the error and skip this iteration if conversion fails
158
- print(f"Error converting start/end to int: {data[1]}, {data[2]} - {e}")
159
- continue # Skip this iteration
160
-
161
- # Proceed as normal if conversion is successful
162
- strand = 1 if data[3] == '+' else -1
163
- location = FeatureLocation(start=start, end=end, strand=strand)
164
- feature = SeqFeature(location=location, type="misc_feature", qualifiers={
165
- 'label': data[5], # gRNA as label
166
- 'note': f"Prediction: {data[6]}" # Prediction score in note
167
- })
168
- features.append(feature)
169
- return features
170
-
171
- def generate_genbank_file_from_data(formatted_data, gene_sequence, gene_symbol, output_path):
172
- features = create_genbank_features(formatted_data)
173
- record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
174
- description='CRISPR Cas12 predicted targets', features=features)
175
- record.annotations["molecule_type"] = "DNA"
176
- SeqIO.write(record, output_path, "genbank")
177
-
178
- def create_csv_from_df(df, output_path):
179
- df.to_csv(output_path, index=False)
180
-
181
- def generate_bed_file_from_data(formatted_data, output_path):
182
- with open(output_path, 'w') as bed_file:
183
- for data in formatted_data:
184
- try:
185
- # Ensure data has the expected number of elements
186
- if len(data) < 7:
187
- raise ValueError("Incomplete data item")
188
-
189
- chrom = data[0]
190
- start = data[1]
191
- end = data[2]
192
- strand = '+' if data[3] == '+' else '-'
193
- gRNA = data[5]
194
- score = data[6] # Ensure this index exists
195
-
196
- bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")
197
- except ValueError as e:
198
- print(f"Skipping an item due to error: {e}")
199
- continue
 
87
  print(f"Error fetching sequence data from Ensembl: {response.text}")
88
  return None
89
 
90
+ def find_crispr_targets(sequence, chr, start, strand, transcript_id, exon_id, pam="TTTN", target_length=34):
 
91
  targets = []
92
  len_sequence = len(sequence)
93
+ complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
94
+ dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
95
+
96
+ if strand == -1:
97
+ sequence = ''.join([complement[base] for base in sequence])
98
 
99
  for i in range(len_sequence - target_length + 1):
100
  target_seq = sequence[i:i + target_length]
101
  if target_seq[4:7] == 'TTT':
102
  tar_start = start + i
103
  tar_end = start + i + target_length
104
+ gRNA = ''.join([dnatorna[base] for base in target_seq[8:28]])
105
+ targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
106
  return targets
107
 
108
+ def format_prediction_output(targets, model_path):
109
+ # Loading weights for the model
110
+ Seq_deepCpf1 = Seq_DeepCpf1_model(input_shape=(34, 4))
111
+ Seq_deepCpf1.load_weights(model_path)
112
+
113
  formatted_data = []
114
  for target in targets:
115
  # Predict
116
+ encoded_seq = get_seqcode(target[0])
117
+ prediction = float(list(Seq_deepCpf1.predict(encoded_seq)[0])[0])
118
+ if prediction > 100:
119
+ prediction = 100
120
+
121
  # Format output
122
+ gRNA = target[1]
123
+ chr = target[2]
124
+ start = target[3]
125
+ end = target[4]
126
+ strand = target[5]
127
+ transcript_id = target[6]
128
+ exon_id = target[7]
129
+ formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, prediction])
130
+
131
  return formatted_data
132
 
133
  def process_gene(gene_symbol, model_path):
134
  transcripts = fetch_ensembl_transcripts(gene_symbol)
135
+ results = []
 
 
 
 
 
 
136
  if transcripts:
137
+ for i in range(len(transcripts)):
138
+ Exons = transcripts[i]['Exon']
139
+ transcript_id = transcripts[i]['id']
140
+ for j in range(len(Exons)):
141
+ exon_id = Exons[j]['id']
142
+ gene_sequence = fetch_ensembl_sequence(exon_id)
143
+ if gene_sequence:
144
+ start = Exons[j]['start']
145
+ strand = Exons[j]['strand']
146
+ chr = Exons[j]['seq_region_name']
147
+ targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
148
+ if targets:
149
+ formatted_data = format_prediction_output(targets,
150
+ '/content/drive/MyDrive/Colab Notebooks/DeepCpf1/Seq_deepCpf1_weights.h5')
151
+ results.append(formatted_data)
152
+ # for data in formatted_data:
153
+ # print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, target: {data[4]}, gRNA: {data[5]}, pred_Score: {data[6]}")
154
+ else:
155
+ print("Failed to retrieve gene sequence.")
156
  else:
157
  print("Failed to retrieve transcripts.")
158
 
159
+ return results, gene_sequence, Exons
160
+
161
+
162
+ # def create_genbank_features(formatted_data):
163
+ # features = []
164
+ # for data in formatted_data:
165
+ # try:
166
+ # # Attempt to convert start and end positions to integers
167
+ # start = int(data[1])
168
+ # end = int(data[2])
169
+ # except ValueError as e:
170
+ # # Log the error and skip this iteration if conversion fails
171
+ # print(f"Error converting start/end to int: {data[1]}, {data[2]} - {e}")
172
+ # continue # Skip this iteration
173
+ #
174
+ # # Proceed as normal if conversion is successful
175
+ # strand = 1 if data[3] == '+' else -1
176
+ # location = FeatureLocation(start=start, end=end, strand=strand)
177
+ # feature = SeqFeature(location=location, type="misc_feature", qualifiers={
178
+ # 'label': data[5], # gRNA as label
179
+ # 'note': f"Prediction: {data[6]}" # Prediction score in note
180
+ # })
181
+ # features.append(feature)
182
+ # return features
183
+ #
184
+ # def generate_genbank_file_from_data(formatted_data, gene_sequence, gene_symbol, output_path):
185
+ # features = create_genbank_features(formatted_data)
186
+ # record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
187
+ # description='CRISPR Cas12 predicted targets', features=features)
188
+ # record.annotations["molecule_type"] = "DNA"
189
+ # SeqIO.write(record, output_path, "genbank")
190
+ #
191
+ # def create_csv_from_df(df, output_path):
192
+ # df.to_csv(output_path, index=False)
193
+ #
194
+ # def generate_bed_file_from_data(formatted_data, output_path):
195
+ # with open(output_path, 'w') as bed_file:
196
+ # for data in formatted_data:
197
+ # try:
198
+ # # Ensure data has the expected number of elements
199
+ # if len(data) < 7:
200
+ # raise ValueError("Incomplete data item")
201
+ #
202
+ # chrom = data[0]
203
+ # start = data[1]
204
+ # end = data[2]
205
+ # strand = '+' if data[3] == '+' else '-'
206
+ # gRNA = data[5]
207
+ # score = data[6] # Ensure this index exists
208
+ #
209
+ # bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")
210
+ # except ValueError as e:
211
+ # print(f"Skipping an item due to error: {e}")
212
+ # continue
cas9on.py CHANGED
@@ -115,31 +115,35 @@ def format_prediction_output(targets, model_path):
115
  def process_gene(gene_symbol, model_path):
116
  transcripts = fetch_ensembl_transcripts(gene_symbol)
117
  results = []
 
 
 
118
  if transcripts:
119
- for i in range(len(transcripts)):
120
- Exons = transcripts[i]['Exon']
121
- transcript_id = transcripts[i]['id']
122
- for j in range(len(Exons)):
123
- exon_id = Exons[j]['id']
 
 
124
  gene_sequence = fetch_ensembl_sequence(exon_id)
125
  if gene_sequence:
126
- start = Exons[j]['start']
127
- strand = Exons[j]['strand']
128
- chr = Exons[j]['seq_region_name']
 
129
  targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
130
- if not targets:
131
- print("No gRNA sites found in the gene sequence.")
132
- else:
133
  # Predict on-target efficiency for each gRNA site
134
- formatted_data = format_prediction_output(targets,model_path)
135
- results.append(formatted_data)
136
- # for data in formatted_data:
137
- # print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, gRNA: {data[4]}, pred_Score: {data[5]}")
138
  else:
139
- print("Failed to retrieve gene sequence.")
140
  else:
141
  print("Failed to retrieve transcripts.")
142
- return results, gene_sequence, Exons
 
 
143
 
144
 
145
  # def create_genbank_features(formatted_data):
 
115
  def process_gene(gene_symbol, model_path):
116
  transcripts = fetch_ensembl_transcripts(gene_symbol)
117
  results = []
118
+ all_exons = [] # To accumulate all exons
119
+ all_gene_sequences = [] # To accumulate all gene sequences
120
+
121
  if transcripts:
122
+ for transcript in transcripts:
123
+ Exons = transcript['Exon']
124
+ all_exons.extend(Exons) # Add all exons from this transcript to the list
125
+ transcript_id = transcript['id']
126
+
127
+ for exon in Exons:
128
+ exon_id = exon['id']
129
  gene_sequence = fetch_ensembl_sequence(exon_id)
130
  if gene_sequence:
131
+ all_gene_sequences.append(gene_sequence) # Add this gene sequence to the list
132
+ start = exon['start']
133
+ strand = exon['strand']
134
+ chr = exon['seq_region_name']
135
  targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
136
+ if targets:
 
 
137
  # Predict on-target efficiency for each gRNA site
138
+ formatted_data = format_prediction_output(targets, model_path)
139
+ results.extend(formatted_data)
 
 
140
  else:
141
+ print(f"Failed to retrieve gene sequence for exon {exon_id}.")
142
  else:
143
  print("Failed to retrieve transcripts.")
144
+
145
+ # Return the sorted output, combined gene sequences, and all exons
146
+ return results, all_gene_sequences, all_exons
147
 
148
 
149
  # def create_genbank_features(formatted_data):