supercat666 committed on
Commit
3023ae4
·
1 Parent(s): 379f333
Files changed (3) hide show
  1. app.py +19 -14
  2. cas9on.py +31 -32
  3. crisprTool.md +2 -1
app.py CHANGED
@@ -181,12 +181,17 @@ if selected_model == 'Cas9':
181
  # Include "Target" in the DataFrame's columns
182
  try:
183
  df = pd.DataFrame(st.session_state['on_target_results'],
184
- columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Target", "gRNA", "Prediction"])
185
  st.dataframe(df)
186
  except ValueError as e:
187
  st.error(f"DataFrame creation error: {e}")
188
  # Optionally print or log the problematic data for debugging:
189
  print(st.session_state['on_target_results'])
 
 
 
 
 
190
  # Initialize Plotly figure
191
  fig = go.Figure()
192
 
@@ -219,17 +224,16 @@ if selected_model == 'Cas9':
219
  name='CDS'
220
  ))
221
 
222
- # Adjust hover interaction and strand plotting
223
- MAX_STRAND_Y = 0.5 # Maximum Y value for positive strand
224
- MIN_STRAND_Y = -0.5 # Minimum Y value for negative strand
225
 
226
- # Iterate over sorted predictions to create the plot
227
- for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
228
- chrom, start, end, strand, target, gRNA, pred_score = prediction
229
  midpoint = (int(start) + int(end)) / 2
230
 
231
- # Position based on strand, but within a fixed range
232
- y_value = MAX_STRAND_Y if strand == '1' else MIN_STRAND_Y
233
 
234
  fig.add_trace(go.Scatter(
235
  x=[midpoint],
@@ -238,18 +242,19 @@ if selected_model == 'Cas9':
238
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
239
  text=f"Rank: {i}", # Text label
240
  hoverinfo='text',
241
- hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
242
  ))
243
 
244
  # Update layout for clarity and interaction
245
  fig.update_layout(
246
- title='Top 10 gRNA Sequences by Prediction Score',
247
  xaxis_title='Genomic Position',
248
- yaxis_title='Strand',
249
- yaxis=dict(range=[MIN_STRAND_Y - 0.1, MAX_STRAND_Y + 0.1]), # Fix y-axis range
 
250
  showlegend=False,
251
  hovermode='closest', # Adjust hover mode
252
- hoverdistance=20, # Reduce hover distance to improve accuracy
253
  )
254
 
255
  # Display the plot
 
181
  # Include "Target" in the DataFrame's columns
182
  try:
183
  df = pd.DataFrame(st.session_state['on_target_results'],
184
+ columns=["Chr", "Start Pos", "End Pos", "Strand", "Transcript", "Target", "gRNA", "pred_Score"])
185
  st.dataframe(df)
186
  except ValueError as e:
187
  st.error(f"DataFrame creation error: {e}")
188
  # Optionally print or log the problematic data for debugging:
189
  print(st.session_state['on_target_results'])
190
+
191
+ # Y-axis reference values used for strand/rank placement
192
+ # Adjust hover interaction and strand plotting
193
+ MAX_STRAND_Y = 0.5 # Maximum Y value for positive strand
194
+ MIN_STRAND_Y = -0.5 # Minimum Y value for negative strand
195
  # Initialize Plotly figure
196
  fig = go.Figure()
197
 
 
224
  name='CDS'
225
  ))
226
 
227
+ # Define the vertical separation for each rank
228
+ VERTICAL_GAP = 0.2 # Gap between different ranks
 
229
 
230
+ # Iterate over top 5 sorted predictions to create the plot
231
+ for i, prediction in enumerate(st.session_state['on_target_results'][:5], start=1): # Only top 5
232
+ chrom, start, end, strand, transcript, target, gRNA, pred_score = prediction
233
  midpoint = (int(start) + int(end)) / 2
234
 
235
+ # Vertical position based on rank, spaced by VERTICAL_GAP
236
+ y_value = MAX_STRAND_Y - (i - 1) * VERTICAL_GAP
237
 
238
  fig.add_trace(go.Scatter(
239
  x=[midpoint],
 
242
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
243
  text=f"Rank: {i}", # Text label
244
  hoverinfo='text',
245
+ hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Transcript:{transcript}<br>Prediction Score: {pred_score:.4f}",
246
  ))
247
 
248
  # Update layout for clarity and interaction
249
  fig.update_layout(
250
+ title='Top 5 gRNA Sequences by Prediction Score',
251
  xaxis_title='Genomic Position',
252
+ yaxis_title='Rank / Strand',
253
+ yaxis=dict(range=[MAX_STRAND_Y - 5 * VERTICAL_GAP, MAX_STRAND_Y + 0.1]),
254
+ # Adjust y-axis range to fit 5 ranks
255
  showlegend=False,
256
  hovermode='closest', # Adjust hover mode
257
+ hoverdistance=10, # Reduce hover distance to improve accuracy
258
  )
259
 
260
  # Display the plot
cas9on.py CHANGED
@@ -52,12 +52,13 @@ def format_prediction_output(targets, model_path):
52
  prediction = dcModel.ontar_predict(encoded_seq)
53
 
54
  # Format output
55
- gRNA = target[1]
56
  chr = target[2]
57
  start = target[3]
58
  end = target[4]
59
  strand = target[5]
60
- formatted_data.append([chr, start, end, strand, target[0], gRNA, prediction[0]])
 
61
 
62
  return formatted_data
63
 
@@ -89,18 +90,41 @@ def fetch_ensembl_sequence(transcript_id):
89
  print(f"Error fetching sequence data from Ensembl: {response.text}")
90
  return None
91
 
92
- def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  targets = []
94
  len_sequence = len(sequence)
 
95
 
 
 
96
  for i in range(len_sequence - len(pam) + 1):
97
  if sequence[i + 1:i + 3] == pam[1:]:
98
  if i >= target_length:
99
  target_seq = sequence[i - target_length:i + 3]
100
  tar_start = start + i - target_length
101
  tar_end = start + i + 3
102
- gRNA = sequence[i - target_length:i]
103
- targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
104
 
105
  return targets
106
 
@@ -111,21 +135,16 @@ def process_gene(gene_symbol, model_path):
111
  gene_sequence = '' # Initialize an empty string for the gene sequence
112
 
113
  if transcripts:
 
114
  for transcript in transcripts:
115
- transcript_id = transcript['id']
116
  chr = transcript.get('seq_region_name', 'unknown')
117
  start = transcript.get('start', 0)
118
  strand = transcript.get('strand', 'unknown')
119
  # Fetch the sequence here and concatenate if multiple transcripts
120
  gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
121
-
122
  # Fetch exon and CDS information
123
  exons = fetch_ensembl_exons(transcript_id)
124
- cds_list = fetch_ensembl_cds(transcript_id)
125
-
126
- # You might want to do something specific with exons and CDS information here
127
- # For example, store them, print them, or include them in your analysis
128
-
129
  if gene_sequence:
130
  gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand)
131
  if gRNA_sites:
@@ -135,26 +154,6 @@ def process_gene(gene_symbol, model_path):
135
  # Return the data, fetched sequence, and possibly exon/CDS data
136
  return all_data, gene_sequence, exons, cds_list
137
 
138
- def fetch_ensembl_exons(transcript_id):
139
- """Fetch exon information for a given transcript from Ensembl."""
140
- url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
141
- response = requests.get(url)
142
- if response.status_code == 200:
143
- return response.json() # Returns a list of exons for the transcript
144
- else:
145
- print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
146
- return None
147
-
148
- def fetch_ensembl_cds(transcript_id):
149
- """Fetch coding sequence (CDS) information for a given transcript from Ensembl."""
150
- url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
151
- response = requests.get(url)
152
- if response.status_code == 200:
153
- return response.json() # Returns a list of CDS regions for the transcript
154
- else:
155
- print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
156
- return None
157
-
158
  def create_genbank_features(formatted_data):
159
  features = []
160
  for data in formatted_data:
 
52
  prediction = dcModel.ontar_predict(encoded_seq)
53
 
54
  # Format output
55
+ sgRNA = target[1]
56
  chr = target[2]
57
  start = target[3]
58
  end = target[4]
59
  strand = target[5]
60
+ transcript_id = target[6]
61
+ formatted_data.append([chr, start, end, strand, transcript_id, target[0], sgRNA, prediction[0]])
62
 
63
  return formatted_data
64
 
 
90
  print(f"Error fetching sequence data from Ensembl: {response.text}")
91
  return None
92
 
93
+ def fetch_ensembl_exons(transcript_id):
94
+ """Fetch exon information for a given transcript from Ensembl."""
95
+ url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
96
+ response = requests.get(url)
97
+ if response.status_code == 200:
98
+ return response.json() # Returns a list of exons for the transcript
99
+ else:
100
+ print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
101
+ return None
102
+
103
+ def fetch_ensembl_cds(transcript_id):
104
+ """Fetch coding sequence (CDS) information for a given transcript from Ensembl."""
105
+ url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
106
+ response = requests.get(url)
107
+ if response.status_code == 200:
108
+ return response.json() # Returns a list of CDS regions for the transcript
109
+ else:
110
+ print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
111
+ return None
112
+
113
+ def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
114
  targets = []
115
  len_sequence = len(sequence)
116
+ complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
117
 
118
+ if strand == -1:
119
+ sequence = ''.join([complement[base] for base in reversed(sequence)])
120
  for i in range(len_sequence - len(pam) + 1):
121
  if sequence[i + 1:i + 3] == pam[1:]:
122
  if i >= target_length:
123
  target_seq = sequence[i - target_length:i + 3]
124
  tar_start = start + i - target_length
125
  tar_end = start + i + 3
126
+ sgRNA = sequence[i - target_length:i]
127
+ targets.append([target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id])
128
 
129
  return targets
130
 
 
135
  gene_sequence = '' # Initialize an empty string for the gene sequence
136
 
137
  if transcripts:
138
+ cds_list = fetch_ensembl_cds(transcripts)
139
  for transcript in transcripts:
140
+ transcript_id = transcript['display_name']
141
  chr = transcript.get('seq_region_name', 'unknown')
142
  start = transcript.get('start', 0)
143
  strand = transcript.get('strand', 'unknown')
144
  # Fetch the sequence here and concatenate if multiple transcripts
145
  gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
 
146
  # Fetch exon and CDS information
147
  exons = fetch_ensembl_exons(transcript_id)
 
 
 
 
 
148
  if gene_sequence:
149
  gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand)
150
  if gRNA_sites:
 
154
  # Return the data, fetched sequence, and possibly exon/CDS data
155
  return all_data, gene_sequence, exons, cds_list
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def create_genbank_features(formatted_data):
158
  features = []
159
  for data in formatted_data:
crisprTool.md CHANGED
@@ -1,4 +1,5 @@
1
  CRISPR Online Tool for Cas9/Cas12/Cas13d Efficacy Prediction
2
 
3
 
4
- You are using version 0.2b of this tool.
 
 
1
  CRISPR Online Tool for Cas9/Cas12/Cas13d Efficacy Prediction
2
 
3
 
4
+ You are using version 1.0b of this tool.
5
+ Note: Once you click the download button, the page will automatically refresh.