roni committed on
Commit
1d11011
·
1 Parent(s): b2a3d53

changing gene info source

Browse files
Files changed (3) hide show
  1. app.py +19 -19
  2. concurrency.py +22 -0
  3. protein_viz.py +37 -8
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
 
 
3
  from get_index import get_engine
4
- from protein_viz import get_gene_names, get_protein_name, render_html
5
 
6
  index_repo = "ronig/siamese_protein_index"
7
  model_repo = "ronig/protein_search_engine"
@@ -23,12 +24,7 @@ def limit_n_results(n):
23
  def update_dropdown_menu(search_res):
24
  choices = []
25
  for row in search_res:
26
- gene = row["gene_ids"][0]
27
- if gene != "Unknown":
28
- choice_parts = [row["pdb_name"], row["chain_id"], gene]
29
- else:
30
- choice_parts = [row["pdb_name"], row["chain_id"]]
31
- choice = ",".join(choice_parts)
32
  choices.append(choice)
33
 
34
  return gr.Dropdown.update(
@@ -38,18 +34,26 @@ def update_dropdown_menu(search_res):
38
 
39
  def format_search_results(raw_search_results):
40
  formatted_search_results = {}
41
- for res in raw_search_results:
42
- prot = res["pdb_name"]
43
- chain = res["chain_id"]
44
- value = res["score"]
45
- genes = ",".join(res["gene_ids"])
46
- key = f"PDB: {prot} | Chain: {chain}"
47
- if genes != "Unknown":
48
- key += f" | Gene: {genes}"
49
  formatted_search_results[key] = value
50
  return formatted_search_results
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
53
  def switch_viz(new_choice):
54
  choice_parts = new_choice.split(",")
55
  pdb_id, chain = choice_parts[0], choice_parts[1]
@@ -57,10 +61,6 @@ def switch_viz(new_choice):
57
  protein_name = get_protein_name(pdb_id)
58
 
59
  new_value = f"""**PDB Title**: {protein_name}"""
60
- if len(choice_parts) > 2:
61
- gene = choice_parts[2]
62
- gene_name = get_gene_names([gene])[0]
63
- new_value += f"""\n\n**Gene Name**: {gene_name.title()}"""
64
 
65
  description_update = gr.Markdown.update(value=new_value, visible=True)
66
  return render_html(pdb_id=pdb_id, chain=chain), title_update, description_update
 
1
  import gradio as gr
2
 
3
+ from concurrency import execute_multithread
4
  from get_index import get_engine
5
+ from protein_viz import get_gene_name, get_protein_name, render_html
6
 
7
  index_repo = "ronig/siamese_protein_index"
8
  model_repo = "ronig/protein_search_engine"
 
24
  def update_dropdown_menu(search_res):
25
  choices = []
26
  for row in search_res:
27
+ choice = ",".join([row["pdb_name"], row["chain_id"]])
 
 
 
 
 
28
  choices.append(choice)
29
 
30
  return gr.Dropdown.update(
 
34
 
35
  def format_search_results(raw_search_results):
36
  formatted_search_results = {}
37
+ for key, value in execute_multithread(
38
+ func=format_search_result,
39
+ inputs=({"raw_result": res} for res in raw_search_results),
40
+ n_workers=len(raw_search_results),
41
+ ):
 
 
 
42
  formatted_search_results[key] = value
43
  return formatted_search_results
44
 
45
 
46
def format_search_result(raw_result):
    """Turn one raw search hit into a ``(label, score)`` pair.

    Args:
        raw_result: Mapping providing the ``"pdb_name"``, ``"chain_id"`` and
            ``"score"`` entries.

    Returns:
        Tuple of the display label and the hit's score. The label carries a
        gene segment only when ``get_gene_name`` returns something other than
        ``"Unknown"``.
    """
    pdb_name, chain_id, score = (
        raw_result[field] for field in ("pdb_name", "chain_id", "score")
    )
    gene = get_gene_name(pdb_id=pdb_name, chain_id=chain_id)

    label = f"PDB: {pdb_name} | Chain: {chain_id}"
    if gene == "Unknown":
        return label, score
    return label + f" | Gene: {gene}", score
55
+
56
+
57
  def switch_viz(new_choice):
58
  choice_parts = new_choice.split(",")
59
  pdb_id, chain = choice_parts[0], choice_parts[1]
 
61
  protein_name = get_protein_name(pdb_id)
62
 
63
  new_value = f"""**PDB Title**: {protein_name}"""
 
 
 
 
64
 
65
  description_update = gr.Markdown.update(value=new_value, visible=True)
66
  return render_html(pdb_id=pdb_id, chain=chain), title_update, description_update
concurrency.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import concurrent.futures
2
+ import itertools
3
+ from typing import Callable, Iterable
4
+
5
+
6
def execute_multithread(func: Callable, inputs: Iterable, n_workers):
    """Run ``func`` over ``inputs`` on a thread pool, yielding results as they finish.

    At most ``n_workers`` calls are in flight at any time; every time some
    calls finish, the same number of new tasks is pulled from ``inputs``.

    Args:
        func: Callable invoked as ``func(**task)`` for each task.
        inputs: Iterable of keyword-argument mappings, one per call. Any
            iterable works (list, tuple, generator, ...).
        n_workers: Maximum number of concurrent calls. Values below 1 are
            treated as 1 so that an empty batch sized with ``len(...)`` does
            not fail when the pool is created.

    Yields:
        The return value of each call, in completion order (not input order).

    Raises:
        Any exception raised by ``func`` is re-raised when its result is
        reached.
    """
    n_workers = max(1, n_workers)
    # Take a single iterator up front: islice() on a re-iterable such as a
    # list would restart from the first element on every refill below.
    tasks = iter(inputs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = {
            executor.submit(func, **task)
            for task in itertools.islice(tasks, n_workers)
        }

        while futures:
            done, futures = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )

            for future in done:
                yield future.result()

            # Refill the pool with as many new tasks as just completed.
            for task in itertools.islice(tasks, len(done)):
                futures.add(executor.submit(func, **task))
protein_viz.py CHANGED
@@ -1,6 +1,3 @@
1
- from typing import List
2
-
3
- import mygene
4
  import requests
5
 
6
 
@@ -33,11 +30,42 @@ def render_html(pdb_id, chain):
33
  return iframe
34
 
35
 
36
- def get_gene_names(genes: List[str]):
37
- mg = mygene.MyGeneInfo()
38
- ginfo = mg.querymany(genes, scopes="ensembl.gene", verbose=False, fields="name")
39
- gene_names = [gene["name"] for gene in ginfo]
40
- return gene_names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  def get_protein_name(pdb_id: str):
@@ -48,4 +76,5 @@ def get_protein_name(pdb_id: str):
48
  protein_name = data["struct"]["title"]
49
  else:
50
  protein_name = "Unknown"
 
51
  return protein_name
 
 
 
 
1
  import requests
2
 
3
 
 
30
  return iframe
31
 
32
 
33
def get_gene_name(pdb_id, chain_id):
    """Resolve the gene name for one chain of a PDB entry.

    The chain is first mapped to its polymer entity id, and that entity is
    then queried for its gene name.

    Args:
        pdb_id: PDB entry identifier.
        chain_id: Chain identifier within that entry.

    Returns:
        Whatever ``get_gene_name_from_polymer_entity`` returns for the
        resolved entity.
    """
    return get_gene_name_from_polymer_entity(
        pdb_id=pdb_id,
        entity_id=get_polymer_entity_id(chain_id, pdb_id),
    )
37
+
38
+
39
def get_polymer_entity_id(chain_id, pdb_id):
    """Look up the polymer entity id of a chain through the RCSB data API.

    The lookup is best-effort: every failure mode collapses to ``None`` so a
    single slow or malformed response cannot fail the caller.

    Args:
        chain_id: Chain identifier, interpolated into the request URL.
        pdb_id: PDB entry identifier, interpolated into the request URL.

    Returns:
        The entity id as an ``int``, or ``None`` when the request fails
        (timeout, connection error, non-OK status) or the payload is not
        valid JSON or lacks the expected field.
    """
    url = (
        f"https://data.rcsb.org/rest/v1/core/"
        f"polymer_entity_instance/{pdb_id}/{chain_id}"
    )
    try:
        response = requests.get(url, timeout=1)
    except requests.RequestException:
        # With a 1 s timeout, timeouts are expected now and then; treat them
        # (and connection errors) like a non-OK response.
        return None

    if not response.ok:
        return None

    try:
        res_data = response.json()
        return int(
            res_data["rcsb_polymer_entity_instance_container_identifiers"]["entity_id"]
        )
    except (KeyError, TypeError, ValueError):
        # ValueError covers both invalid JSON and a non-numeric entity id.
        return None
53
+
54
+
55
def get_gene_name_from_polymer_entity(pdb_id, entity_id):
    """Fetch the first gene name recorded for a polymer entity.

    The lookup is best-effort: any failure yields ``"Unknown"`` instead of
    raising.

    Args:
        pdb_id: PDB entry identifier, interpolated into the request URL.
        entity_id: Polymer entity id within that entry. A falsy value
            (``None``, ``0``) skips the request entirely.

    Returns:
        The ``value`` of the first ``rcsb_gene_name`` entry of the first
        ``rcsb_entity_source_organism`` entry, or ``"Unknown"`` when the
        entity id is missing, the request fails, or the payload does not
        contain that path.
    """
    if not entity_id:
        return "Unknown"

    url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
    try:
        response = requests.get(url, timeout=1)
    except requests.RequestException:
        # Timeouts / connection errors are handled like a non-OK response.
        return "Unknown"

    if not response.ok:
        return "Unknown"

    try:
        res_data = response.json()
        return res_data["rcsb_entity_source_organism"][0]["rcsb_gene_name"][0]["value"]
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing key, empty list, null field, or invalid JSON: no gene
        # name is available for this entity.
        return "Unknown"
69
 
70
 
71
  def get_protein_name(pdb_id: str):
 
76
  protein_name = data["struct"]["title"]
77
  else:
78
  protein_name = "Unknown"
79
+
80
  return protein_name