seanpedrickcase committed on
Commit
cc495e1
·
1 Parent(s): 49e0db8

Rearranged the embedding-creation functions to be compatible with Hugging Face ZeroGPU Spaces. Updated packages.

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
4
  colorFrom: red
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
4
  colorFrom: red
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.8.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
app.py CHANGED
@@ -76,7 +76,7 @@ with app:
76
 
77
  with gr.Accordion("Clean data", open = False):
78
  with gr.Row():
79
- clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
80
  drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
81
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
82
  #with gr.Row():
 
76
 
77
  with gr.Accordion("Clean data", open = False):
78
  with gr.Row():
79
+ clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, large numbers, emails, postcodes (UK).")
80
  drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
81
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
82
  #with gr.Row():
funcs/clean_funcs.py CHANGED
@@ -2,6 +2,7 @@ import re
2
  import string
3
  import unicodedata
4
  import polars as pl
 
5
  import gradio as gr
6
 
7
  # Adding custom words to the stopwords
@@ -15,15 +16,18 @@ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
15
  non_ascii_pattern = r'[^\x00-\x7F]+'
16
  email_pattern_regex = r'\S*@\S*\s?'
17
  num_pattern_regex = r'[0-9]+'
18
- nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 
 
19
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
20
  multiple_spaces_regex = r'\s{2,}'
21
  multiple_new_lines_regex = r'(\r\n|\n)+'
 
22
 
23
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
24
 
25
  for text in texts:
26
- if not text:
27
  text = ""
28
 
29
  # Normalize unicode characters to decompose any special forms
@@ -53,10 +57,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
53
  (html_start_pattern_end_dots_regex, ' '),
54
  (non_ascii_pattern, ' '),
55
  (email_pattern_regex, ' '),
56
- (nums_two_more_regex, ' '),
57
  (postcode_pattern_regex, ' '),
58
  (multiple_spaces_regex, ' '),
59
- (r"(\p{P})\p{P}+", "${1}")
 
 
60
  ]
61
 
62
  # Apply each regex replacement
 
2
  import string
3
  import unicodedata
4
  import polars as pl
5
+ import pandas as pd
6
  import gradio as gr
7
 
8
  # Adding custom words to the stopwords
 
16
  non_ascii_pattern = r'[^\x00-\x7F]+'
17
  email_pattern_regex = r'\S*@\S*\s?'
18
  num_pattern_regex = r'[0-9]+'
19
+ and_sign_regex = r'&'
20
+ forward_slash_regex = r'/'
21
+ nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b' # Matches numbers of five or more digits, and also digit groups of any length joined by a full stop, comma, pipe or single whitespace character
22
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
23
  multiple_spaces_regex = r'\s{2,}'
24
  multiple_new_lines_regex = r'(\r\n|\n)+'
25
+ multiple_punctuation_regex = r"(\p{P})\p{P}+"
26
 
27
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
28
 
29
  for text in texts:
30
+ if not text or pd.isnull(text):
31
  text = ""
32
 
33
  # Normalize unicode characters to decompose any special forms
 
57
  (html_start_pattern_end_dots_regex, ' '),
58
  (non_ascii_pattern, ' '),
59
  (email_pattern_regex, ' '),
60
+ (nums_five_more_regex, ' '),
61
  (postcode_pattern_regex, ' '),
62
  (multiple_spaces_regex, ' '),
63
+ (multiple_punctuation_regex, "${1}"),
64
+ (and_sign_regex, 'and')#,
65
+ #(forward_slash_regex, 'or')
66
  ]
67
 
68
  # Apply each regex replacement
funcs/embeddings.py CHANGED
@@ -1,7 +1,12 @@
1
  import time
2
  import numpy as np
3
  import os
 
4
  from torch import cuda, backends, version
 
 
 
 
5
 
6
  # Check for torch cuda
7
  # If you want to disable cuda for testing purposes
@@ -18,11 +23,9 @@ else:
18
  torch_device = "cpu"
19
  high_quality_mode = "No"
20
 
21
- print("Device used is: ", torch_device)
22
 
23
-
24
-
25
- def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embedding_model, embeddings_super_compress: str, high_quality_mode_opt: str) -> np.ndarray:
26
  """
27
  Create or load embeddings for the given documents.
28
 
@@ -30,7 +33,6 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
30
  docs (list): List of documents to embed.
31
  file_list (list): List of file names to check for existing embeddings.
32
  embeddings_out (np.ndarray): Array to store the embeddings.
33
- embedding_model: Model used to generate embeddings.
34
  embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
35
  high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
36
 
@@ -38,6 +40,33 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
38
  np.ndarray: The generated or loaded embeddings.
39
  """
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # If no embeddings found, make or load in
42
  if embeddings_out.size == 0:
43
  print("Embeddings not found. Loading or generating new ones.")
@@ -84,9 +113,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
84
  embeddings_out = np.round(embeddings_out, 3)
85
  embeddings_out *= 100
86
 
87
- return embeddings_out
88
 
89
  else:
90
  print("Found pre-loaded embeddings.")
91
 
92
- return embeddings_out
 
1
  import time
2
  import numpy as np
3
  import os
4
+ import spaces
5
  from torch import cuda, backends, version
6
+ from sentence_transformers import SentenceTransformer
7
+ from sklearn.pipeline import make_pipeline
8
+ from sklearn.decomposition import TruncatedSVD
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
 
11
  # Check for torch cuda
12
  # If you want to disable cuda for testing purposes
 
23
  torch_device = "cpu"
24
  high_quality_mode = "No"
25
 
 
26
 
27
+ @spaces.GPU
28
+ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1") -> np.ndarray:
 
29
  """
30
  Create or load embeddings for the given documents.
31
 
 
33
  docs (list): List of documents to embed.
34
  file_list (list): List of file names to check for existing embeddings.
35
  embeddings_out (np.ndarray): Array to store the embeddings.
 
36
  embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
37
  high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
38
 
 
40
  np.ndarray: The generated or loaded embeddings.
41
  """
42
 
43
+ if high_quality_mode_opt == "Yes":
44
+ # Define a list of possible local locations to search for the model
45
+ local_embeddings_locations = [
46
+ "model/embed/", # Potential local location
47
+ "/model/embed/", # Potential location in Docker container
48
+ "/home/user/app/model/embed/" # This is inside a Docker container
49
+ ]
50
+
51
+ # Attempt to load the model from each local location
52
+ for location in local_embeddings_locations:
53
+ try:
54
+ embedding_model = SentenceTransformer(location)#, truncate_dim=512)
55
+ print(f"Found local model installation at: {location}")
56
+ break # Exit the loop if the model is found
57
+ except Exception as e:
58
+ print(f"Failed to load model from {location}: {e}")
59
+ continue
60
+ else:
61
+ # If the loop completes without finding the model in any local location
62
+ embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
63
+ print("Could not find local model installation. Downloading from Huggingface")
64
+ else:
65
+ embedding_model = make_pipeline(
66
+ TfidfVectorizer(),
67
+ TruncatedSVD(100, random_state=random_seed)
68
+ )
69
+
70
  # If no embeddings found, make or load in
71
  if embeddings_out.size == 0:
72
  print("Embeddings not found. Loading or generating new ones.")
 
113
  embeddings_out = np.round(embeddings_out, 3)
114
  embeddings_out *= 100
115
 
116
+ return embeddings_out, embedding_model
117
 
118
  else:
119
  print("Found pre-loaded embeddings.")
120
 
121
+ return embeddings_out, embedding_model
funcs/topic_core_funcs.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
7
  import numpy as np
8
  import time
9
  from bertopic import BERTopic
 
10
 
11
  from typing import List, Type, Union
12
  PandasDataFrame = Type[pd.DataFrame]
@@ -17,13 +18,7 @@ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder
17
  from funcs.embeddings import make_or_load_embeddings, torch_device
18
  from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
19
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
20
-
21
  from sklearn.feature_extraction.text import CountVectorizer
22
-
23
- from sentence_transformers import SentenceTransformer
24
- from sklearn.pipeline import make_pipeline
25
- from sklearn.decomposition import TruncatedSVD
26
- from sklearn.feature_extraction.text import TfidfVectorizer
27
  import funcs.anonymiser as anon
28
  from umap import UMAP
29
 
@@ -96,84 +91,88 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
96
  output_list = []
97
  #file_list = [string.name for string in in_files]
98
 
99
- in_colnames_list_first = in_colnames[0]
100
 
101
- # Reset original index to a new column so you can link it to data outputted from cleaning
102
- if not "original_index" in data.columns:
103
- data = data.reset_index(names="original_index")
104
 
105
- if clean_text == "Yes":
106
- clean_tic = time.perf_counter()
107
- print("Starting data clean.")
108
 
109
- data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
 
 
110
 
111
- if '_clean' not in data_file_name_no_ext:
112
- data_file_name_no_ext = data_file_name_no_ext + "_clean"
 
113
 
114
- clean_toc = time.perf_counter()
115
- clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
116
- print(clean_time_out)
117
 
118
- # Clean custom regex if exists
119
- if not custom_regex.empty:
120
- data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
121
 
122
- if '_clean' not in data_file_name_no_ext:
123
- data_file_name_no_ext = data_file_name_no_ext + "_clean"
124
-
125
 
126
- if drop_duplicate_text == "Yes":
127
- progress(0.3, desc= "Drop duplicates - remove short texts")
 
128
 
129
- data_file_name_no_ext = data_file_name_no_ext + "_dedup"
 
 
130
 
131
- #print("Removing duplicates and short entries from data")
132
- #print("Data shape before: ", data.shape)
133
- data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
134
- data = data[data[in_colnames_list_first].str.len() >= 50]
135
- data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
136
-
137
- #print("Data shape after duplicate/null removal: ", data.shape)
138
 
139
- if anonymise_drop == "Yes":
140
- progress(0.4, desc= "Anonymising data")
141
 
142
- if '_anon' not in data_file_name_no_ext:
143
- data_file_name_no_ext = data_file_name_no_ext + "_anon"
 
 
 
 
 
144
 
145
- anon_tic = time.perf_counter()
146
-
147
- data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
148
 
149
- data[in_colnames_list_first] = data_anon_col
 
150
 
151
- print(anonymisation_success)
 
 
152
 
153
- anon_toc = time.perf_counter()
154
- time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
155
 
156
- print(time_out)
157
 
158
- if sentence_split_drop == "Yes":
159
- progress(0.6, desc= "Splitting text into sentences")
160
 
161
- if '_split' not in data_file_name_no_ext:
162
- data_file_name_no_ext = data_file_name_no_ext + "_split"
163
 
164
- anon_tic = time.perf_counter()
165
-
166
- data = expand_sentences_spacy(data, in_colnames_list_first)
167
- data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows longer than min_sentence_length characters
168
- data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
169
- data.reset_index(inplace=True, drop=True)
 
 
 
 
 
 
170
 
171
- anon_toc = time.perf_counter()
172
- time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
173
 
174
- print(time_out)
175
 
176
- data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
177
 
178
  out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
179
  data.to_csv(out_data_name)
@@ -299,27 +298,6 @@ def extract_topics(
299
  if high_quality_mode == "Yes":
300
  print("Using high quality embedding model")
301
 
302
- # Define a list of possible local locations to search for the model
303
- local_embeddings_locations = [
304
- "model/embed/", # Potential local location
305
- "/model/embed/", # Potential location in Docker container
306
- "/home/user/app/model/embed/" # This is inside a Docker container
307
- ]
308
-
309
- # Attempt to load the model from each local location
310
- for location in local_embeddings_locations:
311
- try:
312
- embedding_model = SentenceTransformer(location)#, truncate_dim=512)
313
- print(f"Found local model installation at: {location}")
314
- break # Exit the loop if the model is found
315
- except Exception as e:
316
- print(f"Failed to load model from {location}: {e}")
317
- continue
318
- else:
319
- # If the loop completes without finding the model in any local location
320
- embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
321
- print("Could not find local model installation. Downloading from Huggingface")
322
-
323
  #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
324
 
325
  # If tfidf embeddings currently exist, wipe these empty
@@ -329,15 +307,15 @@ def extract_topics(
329
  embeddings_type_state = "large"
330
 
331
  # UMAP model uses Bertopic defaults
332
- umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
333
 
334
  else:
335
  print("Choosing low resource TF-IDF model.")
336
 
337
- embedding_model = make_pipeline(
338
- TfidfVectorizer(),
339
- TruncatedSVD(100, random_state=random_seed)
340
- )
341
 
342
  # If large embeddings currently exist, wipe these empty, then rename embeddings type
343
  if embeddings_type_state == "large":
@@ -346,10 +324,10 @@ def extract_topics(
346
  embeddings_type_state = "tfidf"
347
 
348
  #umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
349
- # UMAP model uses Bertopic defaults
350
- umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
351
 
352
- embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, high_quality_mode)
353
 
354
  # If you want to save your embedding files
355
  if return_intermediate_files == "Yes":
 
7
  import numpy as np
8
  import time
9
  from bertopic import BERTopic
10
+ import spaces
11
 
12
  from typing import List, Type, Union
13
  PandasDataFrame = Type[pd.DataFrame]
 
18
  from funcs.embeddings import make_or_load_embeddings, torch_device
19
  from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
20
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
 
21
  from sklearn.feature_extraction.text import CountVectorizer
 
 
 
 
 
22
  import funcs.anonymiser as anon
23
  from umap import UMAP
24
 
 
91
  output_list = []
92
  #file_list = [string.name for string in in_files]
93
 
94
+ for in_colnames_list_first in in_colnames:
95
 
96
+ print("Cleaning column:", in_colnames_list_first)
 
 
97
 
98
+ #in_colnames_list_first = in_colnames[0]
 
 
99
 
100
+ # Reset original index to a new column so you can link it to data outputted from cleaning
101
+ if not "original_index" in data.columns:
102
+ data = data.reset_index(names="original_index")
103
 
104
+ if clean_text == "Yes":
105
+ clean_tic = time.perf_counter()
106
+ print("Starting data clean.")
107
 
108
+ data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
 
 
109
 
110
+ if '_clean' not in data_file_name_no_ext:
111
+ data_file_name_no_ext = data_file_name_no_ext + "_clean"
 
112
 
113
+ clean_toc = time.perf_counter()
114
+ clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
115
+ print(clean_time_out)
116
 
117
+ # Clean custom regex if exists
118
+ if not custom_regex.empty:
119
+ data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
120
 
121
+ if '_clean' not in data_file_name_no_ext:
122
+ data_file_name_no_ext = data_file_name_no_ext + "_clean"
123
+
124
 
125
+ if drop_duplicate_text == "Yes":
126
+ progress(0.3, desc= "Drop duplicates - remove short texts")
 
 
 
 
 
127
 
128
+ data_file_name_no_ext = data_file_name_no_ext + "_dedup"
 
129
 
130
+ #print("Removing duplicates and short entries from data")
131
+ #print("Data shape before: ", data.shape)
132
+ data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
133
+ data = data[data[in_colnames_list_first].str.len() >= 50]
134
+ data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
135
+
136
+ #print("Data shape after duplicate/null removal: ", data.shape)
137
 
138
+ if anonymise_drop == "Yes":
139
+ progress(0.4, desc= "Anonymising data")
 
140
 
141
+ if '_anon' not in data_file_name_no_ext:
142
+ data_file_name_no_ext = data_file_name_no_ext + "_anon"
143
 
144
+ anon_tic = time.perf_counter()
145
+
146
+ data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
147
 
148
+ data[in_colnames_list_first] = data_anon_col
 
149
 
150
+ print(anonymisation_success)
151
 
152
+ anon_toc = time.perf_counter()
153
+ time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
154
 
155
+ print(time_out)
 
156
 
157
+ if sentence_split_drop == "Yes":
158
+ progress(0.6, desc= "Splitting text into sentences")
159
+
160
+ if '_split' not in data_file_name_no_ext:
161
+ data_file_name_no_ext = data_file_name_no_ext + "_split"
162
+
163
+ anon_tic = time.perf_counter()
164
+
165
+ data = expand_sentences_spacy(data, in_colnames_list_first)
166
+ data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows longer than min_sentence_length characters
167
+ data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
168
+ data.reset_index(inplace=True, drop=True)
169
 
170
+ anon_toc = time.perf_counter()
171
+ time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
172
 
173
+ print(time_out)
174
 
175
+ data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
176
 
177
  out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
178
  data.to_csv(out_data_name)
 
298
  if high_quality_mode == "Yes":
299
  print("Using high quality embedding model")
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
302
 
303
  # If tfidf embeddings currently exist, wipe these empty
 
307
  embeddings_type_state = "large"
308
 
309
  # UMAP model uses Bertopic defaults
310
+ #umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
311
 
312
  else:
313
  print("Choosing low resource TF-IDF model.")
314
 
315
+ # embedding_model = make_pipeline(
316
+ # TfidfVectorizer(),
317
+ # TruncatedSVD(100, random_state=random_seed)
318
+ # )
319
 
320
  # If large embeddings currently exist, wipe these empty, then rename embeddings type
321
  if embeddings_type_state == "large":
 
324
  embeddings_type_state = "tfidf"
325
 
326
  #umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
327
+ # UMAP model uses Bertopic defaults
328
+ umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
329
 
330
+ embeddings_out, embedding_model = make_or_load_embeddings(docs, file_list, embeddings_out, embeddings_super_compress, high_quality_mode, embeddings_name)
331
 
332
  # If you want to save your embedding files
333
  if return_intermediate_files == "Yes":
requirements.txt CHANGED
@@ -3,11 +3,10 @@ pandas==2.2.3
3
  plotly==5.24.1
4
  scikit-learn==1.5.2
5
  umap-learn==0.5.7
6
- gradio==5.6.0
7
- boto3==1.35.64
8
  transformers==4.46.3
9
  accelerate==1.1.1
10
- torch==2.5.1
11
  bertopic==0.16.4
12
  spacy==3.8.0
13
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
@@ -18,6 +17,9 @@ presidio_analyzer==2.2.355
18
  presidio_anonymizer==2.2.355
19
  scipy
20
  polars
21
- sentence-transformers==3.2.0
22
- llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
23
- #numpy==1.26.4
 
 
 
 
3
  plotly==5.24.1
4
  scikit-learn==1.5.2
5
  umap-learn==0.5.7
6
+ gradio==5.8.0
7
+ boto3==1.35.71
8
  transformers==4.46.3
9
  accelerate==1.1.1
 
10
  bertopic==0.16.4
11
  spacy==3.8.0
12
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 
17
  presidio_anonymizer==2.2.355
18
  scipy
19
  polars
20
+ sentence-transformers==3.3.1
21
+ torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
22
+ #llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
23
+ # Specify exact llama_cpp wheel for huggingface compatibility
24
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
25
+ numpy==1.26.4
requirements_aws.txt CHANGED
@@ -6,7 +6,7 @@ umap-learn==0.5.7
6
  boto3==1.35.64
7
  spacy==3.8.0
8
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
9
- gradio==5.6.0
10
  pyarrow
11
  openpyxl
12
  Faker
 
6
  boto3==1.35.64
7
  spacy==3.8.0
8
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
9
+ gradio==5.8.0
10
  pyarrow
11
  openpyxl
12
  Faker
requirements_gpu.txt CHANGED
@@ -18,8 +18,7 @@ presidio_analyzer==2.2.355
18
  presidio_anonymizer==2.2.355
19
  scipy
20
  polars
21
- llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
22
- torch --index-url https://download.pytorch.org/whl/cu121
23
- sentence-transformers==3.2.0
24
- #numpy==1.26.4
25
 
 
18
  presidio_anonymizer==2.2.355
19
  scipy
20
  polars
21
+ llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
22
+ sentence-transformers==3.3.1
23
+ numpy==1.26.4
 
24