Spaces:
Running
Running
roni
committed on
Commit
·
1694358
1
Parent(s):
5e7a3eb
using metadata instead of fetching it from the internet
Browse files
- app.py +8 -32
- concurrency.py +0 -22
- protein_viz.py +0 -56
app.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
from concurrency import execute_multithread
|
4 |
from get_index import get_engines
|
5 |
-
from protein_viz import
|
6 |
|
7 |
index_repo = "ronig/siamese_protein_index"
|
8 |
model_repo = "ronig/protein_search_engine"
|
@@ -15,6 +14,7 @@ You can use it to search the full [PDB](https://www.rcsb.org/) database or in a
|
|
15 |
"""
|
16 |
max_results = 100
|
17 |
|
|
|
18 |
def search_and_display(seq, n_res, index_selection):
|
19 |
n_res = int(limit_n_results(n_res))
|
20 |
engine = engines[index_selection]
|
@@ -47,48 +47,24 @@ def update_dropdown_menu(search_res):
|
|
47 |
|
48 |
def format_search_results(raw_search_results):
|
49 |
formatted_search_results = {}
|
50 |
-
for
|
51 |
-
|
52 |
-
inputs=({"raw_result": res} for res in raw_search_results),
|
53 |
-
n_workers=len(raw_search_results),
|
54 |
-
):
|
55 |
formatted_search_results[key] = value
|
56 |
return formatted_search_results
|
57 |
|
58 |
|
59 |
-
def format_search_result(raw_result):
|
60 |
-
is_pdb = "pdb_name" in raw_result
|
61 |
-
if is_pdb:
|
62 |
-
key, value = parse_pdb_search_result(raw_result)
|
63 |
-
else:
|
64 |
-
key, value = parse_fasta_search_result(raw_result)
|
65 |
-
return key, value
|
66 |
-
|
67 |
-
|
68 |
-
def parse_fasta_search_result(raw_result):
|
69 |
-
gene = parse_gene_from_fasta_entry(raw_result["description"])
|
70 |
-
key = f"Gene: {gene}"
|
71 |
-
value = raw_result["score"]
|
72 |
-
return key, value
|
73 |
-
|
74 |
-
|
75 |
def parse_pdb_search_result(raw_result):
|
76 |
prot = raw_result["pdb_name"]
|
77 |
chain = raw_result["chain_id"]
|
78 |
value = raw_result["score"]
|
79 |
-
|
|
|
80 |
key = f"PDB: {prot}.{chain}"
|
81 |
-
if
|
82 |
-
key += f" |
|
83 |
return key, value
|
84 |
|
85 |
|
86 |
-
def parse_gene_from_fasta_entry(description):
|
87 |
-
after = description.split("GN=")[1]
|
88 |
-
gene = after.split(" ")[0]
|
89 |
-
return gene
|
90 |
-
|
91 |
-
|
92 |
def switch_viz(new_choice):
|
93 |
if new_choice is None:
|
94 |
html = ""
|
|
|
1 |
import gradio as gr
|
2 |
|
|
|
3 |
from get_index import get_engines
|
4 |
+
from protein_viz import get_protein_name, render_html
|
5 |
|
6 |
index_repo = "ronig/siamese_protein_index"
|
7 |
model_repo = "ronig/protein_search_engine"
|
|
|
14 |
"""
|
15 |
max_results = 100
|
16 |
|
17 |
+
|
18 |
def search_and_display(seq, n_res, index_selection):
|
19 |
n_res = int(limit_n_results(n_res))
|
20 |
engine = engines[index_selection]
|
|
|
47 |
|
48 |
def format_search_results(raw_search_results):
|
49 |
formatted_search_results = {}
|
50 |
+
for res in raw_search_results:
|
51 |
+
key, value = parse_pdb_search_result(res)
|
|
|
|
|
|
|
52 |
formatted_search_results[key] = value
|
53 |
return formatted_search_results
|
54 |
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def parse_pdb_search_result(raw_result):
|
57 |
prot = raw_result["pdb_name"]
|
58 |
chain = raw_result["chain_id"]
|
59 |
value = raw_result["score"]
|
60 |
+
gene_names = raw_result["genes"]
|
61 |
+
species = raw_result["organism"]
|
62 |
key = f"PDB: {prot}.{chain}"
|
63 |
+
if gene_names is not None:
|
64 |
+
key += f" | Genes: {gene_names} | Organism: {species}"
|
65 |
return key, value
|
66 |
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
def switch_viz(new_choice):
|
69 |
if new_choice is None:
|
70 |
html = ""
|
concurrency.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
import concurrent.futures
|
2 |
-
import itertools
|
3 |
-
from typing import Callable, Iterable
|
4 |
-
|
5 |
-
|
6 |
-
def execute_multithread(func: Callable, inputs: Iterable, n_workers):
|
7 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
|
8 |
-
futures = {
|
9 |
-
executor.submit(func, **task)
|
10 |
-
for task in itertools.islice(inputs, n_workers)
|
11 |
-
}
|
12 |
-
|
13 |
-
while futures:
|
14 |
-
done, futures = concurrent.futures.wait(
|
15 |
-
futures, return_when=concurrent.futures.FIRST_COMPLETED
|
16 |
-
)
|
17 |
-
|
18 |
-
for future in done:
|
19 |
-
yield future.result()
|
20 |
-
|
21 |
-
for task in itertools.islice(inputs, len(done)):
|
22 |
-
futures.add(executor.submit(func, **task))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
protein_viz.py
CHANGED
@@ -30,62 +30,6 @@ def render_html(pdb_id, chain):
|
|
30 |
return iframe
|
31 |
|
32 |
|
33 |
-
def get_gene_name(pdb_id, chain_id):
|
34 |
-
entity_id = get_polymer_entity_id(chain_id, pdb_id)
|
35 |
-
gene_name, species = get_gene_name_from_polymer_entity(
|
36 |
-
pdb_id=pdb_id, entity_id=entity_id
|
37 |
-
)
|
38 |
-
return gene_name, species
|
39 |
-
|
40 |
-
|
41 |
-
def get_polymer_entity_id(chain_id, pdb_id):
|
42 |
-
url = (
|
43 |
-
f"https://data.rcsb.org/rest/v1/core/"
|
44 |
-
f"polymer_entity_instance/{pdb_id}/{chain_id}"
|
45 |
-
)
|
46 |
-
response = requests.get(url, timeout=1)
|
47 |
-
if response.ok:
|
48 |
-
res_data = response.json()
|
49 |
-
entity_id = int(
|
50 |
-
res_data["rcsb_polymer_entity_instance_container_identifiers"]["entity_id"]
|
51 |
-
)
|
52 |
-
else:
|
53 |
-
entity_id = None
|
54 |
-
return entity_id
|
55 |
-
|
56 |
-
|
57 |
-
def get_gene_name_from_polymer_entity(pdb_id, entity_id):
|
58 |
-
gene_name, species = None, None
|
59 |
-
if entity_id:
|
60 |
-
url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
|
61 |
-
response = requests.get(url, timeout=1)
|
62 |
-
if response.ok:
|
63 |
-
res_data = response.json()
|
64 |
-
uniprot_id = _extract_uniprot_id(res_data)
|
65 |
-
source_organism = res_data.get("rcsb_entity_source_organism", [{}])[0]
|
66 |
-
gene_name = source_organism.get("rcsb_gene_name", [{}])[0].get("value")
|
67 |
-
species = source_organism.get("scientific_name")
|
68 |
-
if gene_name is None and uniprot_id is not None:
|
69 |
-
gene_name = get_gene_name_from_uniprot(uniprot_id)
|
70 |
-
return gene_name, species
|
71 |
-
|
72 |
-
|
73 |
-
def get_gene_name_from_uniprot(uniprot_id):
|
74 |
-
gene_name = None
|
75 |
-
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}"
|
76 |
-
response = requests.get(url, timeout=1.0)
|
77 |
-
if response.ok:
|
78 |
-
uniprot_data = response.json()
|
79 |
-
gene_name = uniprot_data.get("genes", [{}])[0].get("geneName", {}).get("value")
|
80 |
-
return gene_name
|
81 |
-
|
82 |
-
|
83 |
-
def _extract_uniprot_id(res_data):
|
84 |
-
ids = res_data.get("rcsb_polymer_entity_container_identifiers", {})
|
85 |
-
uniprot_id = ids.get("uniprot_ids", [None])[0]
|
86 |
-
return uniprot_id
|
87 |
-
|
88 |
-
|
89 |
def get_protein_name(pdb_id: str):
|
90 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
91 |
response = requests.get(url, timeout=1)
|
|
|
30 |
return iframe
|
31 |
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def get_protein_name(pdb_id: str):
|
34 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
35 |
response = requests.get(url, timeout=1)
|