Use sparse matrices
- app.py +5 -1
- data_utils.py +24 -21
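In short: data_utils.py stops building the formula index with a pandas extractall/pivot_table over chemical_formula_descriptive and instead counts elements directly from species_at_sites, stores the normalized count matrix as a scipy CSR sparse matrix, and caches it on disk with save_npz/load_npz alongside the pickled DataFrame. app.py drops the now-unused build_embeddings_index import and creates the cache directory at startup.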
app.py CHANGED
@@ -18,7 +18,6 @@ from components import (
     get_upload_div,
 )
 from data_utils import (
-    build_embeddings_index,
     build_formula_index,
     get_crystal_plot,
     get_dataset,
@@ -29,6 +28,11 @@ from data_utils import (
 EMPTY_DATA = False
 CACHE_PATH = None
 
+if CACHE_PATH is not None:
+    import os
+
+    os.makedirs(CACHE_PATH, exist_ok=True)
+
 dataset = get_dataset()
 
 display_columns_query = [
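CACHE_PATH defaults to None, so the new os.makedirs guard is inert until a path is configured; once it is set, the directory is created up front (exist_ok=True makes this idempotent across restarts), presumably so that build_formula_index in data_utils.py can read and write its train_df.pkl and dataset_index.npz cache files there.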
data_utils.py CHANGED
@@ -72,6 +72,7 @@ mapping_table_idx_dataset_idx = {}
 
 
 def build_formula_index(dataset, index_range=None, cache_path=None, empty_data=False):
+    print("Building formula index")
     if empty_data:
         return np.zeros((1, 1)), {}
 
@@ -80,40 +81,42 @@ def build_formula_index(dataset, index_range=None, cache_path=None, empty_data=False):
     use_dataset = dataset.select(index_range)
 
     # Preprocessing step to create an index for the dataset
-    …
-        train_df = pickle.load(open(f"{cache_path}/train_df.pkl", "rb"))
+    from scipy.sparse import load_npz
 
-        …
+    if cache_path is not None and os.path.exists(f"{cache_path}/train_df.pkl"):
+        train_df = pickle.load(open(f"{cache_path}/train_df.pkl", "rb"))
+        dataset_index = load_npz(f"{cache_path}/dataset_index.npz")
     else:
         train_df = use_dataset.select_columns(
-            ["…
+            ["species_at_sites", "immutable_id", "functional"]
         ).to_pandas()
 
-
-        extracted = train_df["chemical_formula_descriptive"].str.extractall(pattern)
-        extracted["count"] = extracted["count"].replace("", "1").astype(int)
-
-        wide_df = (
-            extracted.reset_index().pivot_table(  # Move index to columns for pivoting
-                index="level_0",  # original row index
-                columns="element",
-                values="count",
-                aggfunc="sum",
-                fill_value=0,
-            )
-        )
+        import tqdm
 
-        all_elements = …
-        …
+        all_elements = {
+            str(el.symbol): i for i, el in enumerate(periodictable.elements)
+        }  # full element list
+        dataset_index = np.zeros((len(train_df), len(all_elements)))
 
-        …
+        for idx, species in tqdm.tqdm(enumerate(train_df["species_at_sites"].values)):
+            for el in species:
+                dataset_index[idx, all_elements[el]] += 1
 
         dataset_index = dataset_index / np.sum(dataset_index, axis=1)[:, None]
         dataset_index = (
            dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]
         )  # Normalize vectors
 
+        from scipy.sparse import csr_matrix, save_npz
+
+        dataset_index = csr_matrix(dataset_index)
+
+        if cache_path is not None:
+            pickle.dump(train_df, open(f"{cache_path}/train_df.pkl", "wb"))
+            save_npz(f"{cache_path}/dataset_index.npz", dataset_index)
+
     immutable_id_to_idx = train_df["immutable_id"].to_dict()
+    del train_df
     immutable_id_to_idx = {v: k for k, v in immutable_id_to_idx.items()}
 
     return dataset_index, immutable_id_to_idx
@@ -162,7 +165,7 @@ def search_materials(
     numb = int(numb) if numb else 1
     query_vector[map_periodic_table[el]] = numb
 
-    similarity = …
+    similarity = dataset_index.dot(query_vector) / (np.linalg.norm(query_vector))
     indices = np.argsort(similarity)[::-1][:top_k]
 
     options = [dataset[int(i)] for i in indices]
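To see why the sparse representation is a drop-in change for the search path, here is a minimal, self-contained sketch of the round trip the commit implements. It uses an illustrative 3-element universe and made-up compositions instead of the real periodictable-wide index; only the dataset_index.npz file name and the normalization/similarity steps come from the diff.

import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz

# Toy element-count matrix: one row per structure, one column per element
# (hypothetical 3-element universe; the real index has one column per
# entry in periodictable.elements).
dataset_index = np.array(
    [
        [2.0, 1.0, 0.0],  # 2 atoms of element 0, 1 atom of element 1
        [0.0, 2.0, 1.0],
        [1.0, 0.0, 1.0],
    ]
)

# Same two-step normalization as build_formula_index: fractional
# composition first, then unit-length rows.
dataset_index = dataset_index / np.sum(dataset_index, axis=1)[:, None]
dataset_index = dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]

# Element-count rows are mostly zeros, so CSR storage keeps only the
# non-zero entries; save_npz/load_npz round-trips it through the cache.
dataset_index = csr_matrix(dataset_index)
save_npz("dataset_index.npz", dataset_index)
dataset_index = load_npz("dataset_index.npz")

# Query side, as in search_materials: a sparse matrix dotted with a
# dense 1-D vector returns a dense 1-D ndarray, so the downstream
# np.argsort keeps working unchanged.
query_vector = np.array([2.0, 1.0, 0.0])
similarity = dataset_index.dot(query_vector) / np.linalg.norm(query_vector)
top_k = 2
indices = np.argsort(similarity)[::-1][:top_k]
print(indices, similarity[indices])

The similarity denominator only needs the query norm because the stored rows are already unit length, so the dot product is a true cosine similarity between the query composition and each structure.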