roni committed on
Commit
1694358
·
1 Parent(s): 5e7a3eb

using metadata instead of fetching it from the internet

Browse files
Files changed (3) hide show
  1. app.py +8 -32
  2. concurrency.py +0 -22
  3. protein_viz.py +0 -56
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import gradio as gr
2
 
3
- from concurrency import execute_multithread
4
  from get_index import get_engines
5
- from protein_viz import get_gene_name, get_protein_name, render_html
6
 
7
  index_repo = "ronig/siamese_protein_index"
8
  model_repo = "ronig/protein_search_engine"
@@ -15,6 +14,7 @@ You can use it to search the full [PDB](https://www.rcsb.org/) database or in a
15
  """
16
  max_results = 100
17
 
 
18
  def search_and_display(seq, n_res, index_selection):
19
  n_res = int(limit_n_results(n_res))
20
  engine = engines[index_selection]
@@ -47,48 +47,24 @@ def update_dropdown_menu(search_res):
47
 
48
  def format_search_results(raw_search_results):
49
  formatted_search_results = {}
50
- for key, value in execute_multithread(
51
- func=format_search_result,
52
- inputs=({"raw_result": res} for res in raw_search_results),
53
- n_workers=len(raw_search_results),
54
- ):
55
  formatted_search_results[key] = value
56
  return formatted_search_results
57
 
58
 
59
- def format_search_result(raw_result):
60
- is_pdb = "pdb_name" in raw_result
61
- if is_pdb:
62
- key, value = parse_pdb_search_result(raw_result)
63
- else:
64
- key, value = parse_fasta_search_result(raw_result)
65
- return key, value
66
-
67
-
68
- def parse_fasta_search_result(raw_result):
69
- gene = parse_gene_from_fasta_entry(raw_result["description"])
70
- key = f"Gene: {gene}"
71
- value = raw_result["score"]
72
- return key, value
73
-
74
-
75
  def parse_pdb_search_result(raw_result):
76
  prot = raw_result["pdb_name"]
77
  chain = raw_result["chain_id"]
78
  value = raw_result["score"]
79
- gene_name, species = get_gene_name(pdb_id=prot, chain_id=chain)
 
80
  key = f"PDB: {prot}.{chain}"
81
- if gene_name is not None:
82
- key += f" | Gene: {gene_name} | Organism: {species}"
83
  return key, value
84
 
85
 
86
- def parse_gene_from_fasta_entry(description):
87
- after = description.split("GN=")[1]
88
- gene = after.split(" ")[0]
89
- return gene
90
-
91
-
92
  def switch_viz(new_choice):
93
  if new_choice is None:
94
  html = ""
 
1
  import gradio as gr
2
 
 
3
  from get_index import get_engines
4
+ from protein_viz import get_protein_name, render_html
5
 
6
  index_repo = "ronig/siamese_protein_index"
7
  model_repo = "ronig/protein_search_engine"
 
14
  """
15
  max_results = 100
16
 
17
+
18
  def search_and_display(seq, n_res, index_selection):
19
  n_res = int(limit_n_results(n_res))
20
  engine = engines[index_selection]
 
47
 
48
  def format_search_results(raw_search_results):
49
  formatted_search_results = {}
50
+ for res in raw_search_results:
51
+ key, value = parse_pdb_search_result(res)
 
 
 
52
  formatted_search_results[key] = value
53
  return formatted_search_results
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def parse_pdb_search_result(raw_result):
57
  prot = raw_result["pdb_name"]
58
  chain = raw_result["chain_id"]
59
  value = raw_result["score"]
60
+ gene_names = raw_result["genes"]
61
+ species = raw_result["organism"]
62
  key = f"PDB: {prot}.{chain}"
63
+ if gene_names is not None:
64
+ key += f" | Genes: {gene_names} | Organism: {species}"
65
  return key, value
66
 
67
 
 
 
 
 
 
 
68
  def switch_viz(new_choice):
69
  if new_choice is None:
70
  html = ""
concurrency.py DELETED
@@ -1,22 +0,0 @@
1
- import concurrent.futures
2
- import itertools
3
- from typing import Callable, Iterable
4
-
5
-
6
- def execute_multithread(func: Callable, inputs: Iterable, n_workers):
7
- with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
8
- futures = {
9
- executor.submit(func, **task)
10
- for task in itertools.islice(inputs, n_workers)
11
- }
12
-
13
- while futures:
14
- done, futures = concurrent.futures.wait(
15
- futures, return_when=concurrent.futures.FIRST_COMPLETED
16
- )
17
-
18
- for future in done:
19
- yield future.result()
20
-
21
- for task in itertools.islice(inputs, len(done)):
22
- futures.add(executor.submit(func, **task))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
protein_viz.py CHANGED
@@ -30,62 +30,6 @@ def render_html(pdb_id, chain):
30
  return iframe
31
 
32
 
33
- def get_gene_name(pdb_id, chain_id):
34
- entity_id = get_polymer_entity_id(chain_id, pdb_id)
35
- gene_name, species = get_gene_name_from_polymer_entity(
36
- pdb_id=pdb_id, entity_id=entity_id
37
- )
38
- return gene_name, species
39
-
40
-
41
- def get_polymer_entity_id(chain_id, pdb_id):
42
- url = (
43
- f"https://data.rcsb.org/rest/v1/core/"
44
- f"polymer_entity_instance/{pdb_id}/{chain_id}"
45
- )
46
- response = requests.get(url, timeout=1)
47
- if response.ok:
48
- res_data = response.json()
49
- entity_id = int(
50
- res_data["rcsb_polymer_entity_instance_container_identifiers"]["entity_id"]
51
- )
52
- else:
53
- entity_id = None
54
- return entity_id
55
-
56
-
57
- def get_gene_name_from_polymer_entity(pdb_id, entity_id):
58
- gene_name, species = None, None
59
- if entity_id:
60
- url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
61
- response = requests.get(url, timeout=1)
62
- if response.ok:
63
- res_data = response.json()
64
- uniprot_id = _extract_uniprot_id(res_data)
65
- source_organism = res_data.get("rcsb_entity_source_organism", [{}])[0]
66
- gene_name = source_organism.get("rcsb_gene_name", [{}])[0].get("value")
67
- species = source_organism.get("scientific_name")
68
- if gene_name is None and uniprot_id is not None:
69
- gene_name = get_gene_name_from_uniprot(uniprot_id)
70
- return gene_name, species
71
-
72
-
73
- def get_gene_name_from_uniprot(uniprot_id):
74
- gene_name = None
75
- url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}"
76
- response = requests.get(url, timeout=1.0)
77
- if response.ok:
78
- uniprot_data = response.json()
79
- gene_name = uniprot_data.get("genes", [{}])[0].get("geneName", {}).get("value")
80
- return gene_name
81
-
82
-
83
- def _extract_uniprot_id(res_data):
84
- ids = res_data.get("rcsb_polymer_entity_container_identifiers", {})
85
- uniprot_id = ids.get("uniprot_ids", [None])[0]
86
- return uniprot_id
87
-
88
-
89
  def get_protein_name(pdb_id: str):
90
  url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
91
  response = requests.get(url, timeout=1)
 
30
  return iframe
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def get_protein_name(pdb_id: str):
34
  url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
35
  response = requests.get(url, timeout=1)