Spaces:
Running
Running
seanpedrickcase
committed on
Commit
·
cc495e1
1
Parent(s):
49e0db8
Rearranged the embedding-creation functions to be compatible with Hugging Face ZeroGPU Spaces. Updated packages.
Browse files
- README.md +1 -1
- app.py +1 -1
- funcs/clean_funcs.py +10 -4
- funcs/embeddings.py +36 -7
- funcs/topic_core_funcs.py +68 -90
- requirements.txt +8 -6
- requirements_aws.txt +1 -1
- requirements_gpu.txt +3 -4
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.8.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
app.py
CHANGED
@@ -76,7 +76,7 @@ with app:
|
|
76 |
|
77 |
with gr.Accordion("Clean data", open = False):
|
78 |
with gr.Row():
|
79 |
-
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII,
|
80 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
81 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
|
82 |
#with gr.Row():
|
|
|
76 |
|
77 |
with gr.Accordion("Clean data", open = False):
|
78 |
with gr.Row():
|
79 |
+
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, large numbers, emails, postcodes (UK).")
|
80 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
81 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
|
82 |
#with gr.Row():
|
funcs/clean_funcs.py
CHANGED
@@ -2,6 +2,7 @@ import re
|
|
2 |
import string
|
3 |
import unicodedata
|
4 |
import polars as pl
|
|
|
5 |
import gradio as gr
|
6 |
|
7 |
# Adding custom words to the stopwords
|
@@ -15,15 +16,18 @@ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
|
15 |
non_ascii_pattern = r'[^\x00-\x7F]+'
|
16 |
email_pattern_regex = r'\S*@\S*\s?'
|
17 |
num_pattern_regex = r'[0-9]+'
|
18 |
-
|
|
|
|
|
19 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
20 |
multiple_spaces_regex = r'\s{2,}'
|
21 |
multiple_new_lines_regex = r'(\r\n|\n)+'
|
|
|
22 |
|
23 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
24 |
|
25 |
for text in texts:
|
26 |
-
if not text:
|
27 |
text = ""
|
28 |
|
29 |
# Normalize unicode characters to decompose any special forms
|
@@ -53,10 +57,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
53 |
(html_start_pattern_end_dots_regex, ' '),
|
54 |
(non_ascii_pattern, ' '),
|
55 |
(email_pattern_regex, ' '),
|
56 |
-
(
|
57 |
(postcode_pattern_regex, ' '),
|
58 |
(multiple_spaces_regex, ' '),
|
59 |
-
(
|
|
|
|
|
60 |
]
|
61 |
|
62 |
# Apply each regex replacement
|
|
|
2 |
import string
|
3 |
import unicodedata
|
4 |
import polars as pl
|
5 |
+
import pandas as pd
|
6 |
import gradio as gr
|
7 |
|
8 |
# Adding custom words to the stopwords
|
|
|
16 |
non_ascii_pattern = r'[^\x00-\x7F]+'
|
17 |
email_pattern_regex = r'\S*@\S*\s?'
|
18 |
num_pattern_regex = r'[0-9]+'
|
19 |
+
and_sign_regex = r'&'
|
20 |
+
forward_slash_regex = r'/'
|
21 |
+
nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b' # Should match five digit numbers or more, and also if there are full stops or commas in between
|
22 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
23 |
multiple_spaces_regex = r'\s{2,}'
|
24 |
multiple_new_lines_regex = r'(\r\n|\n)+'
|
25 |
+
multiple_punctuation_regex = r"(\p{P})\p{P}+"
|
26 |
|
27 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
28 |
|
29 |
for text in texts:
|
30 |
+
if not text or pd.isnull(text):
|
31 |
text = ""
|
32 |
|
33 |
# Normalize unicode characters to decompose any special forms
|
|
|
57 |
(html_start_pattern_end_dots_regex, ' '),
|
58 |
(non_ascii_pattern, ' '),
|
59 |
(email_pattern_regex, ' '),
|
60 |
+
(nums_five_more_regex, ' '),
|
61 |
(postcode_pattern_regex, ' '),
|
62 |
(multiple_spaces_regex, ' '),
|
63 |
+
(multiple_punctuation_regex, "${1}"),
|
64 |
+
(and_sign_regex, 'and')#,
|
65 |
+
#(forward_slash_regex, 'or')
|
66 |
]
|
67 |
|
68 |
# Apply each regex replacement
|
funcs/embeddings.py
CHANGED
@@ -1,7 +1,12 @@
|
|
1 |
import time
|
2 |
import numpy as np
|
3 |
import os
|
|
|
4 |
from torch import cuda, backends, version
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Check for torch cuda
|
7 |
# If you want to disable cuda for testing purposes
|
@@ -18,11 +23,9 @@ else:
|
|
18 |
torch_device = "cpu"
|
19 |
high_quality_mode = "No"
|
20 |
|
21 |
-
print("Device used is: ", torch_device)
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embedding_model, embeddings_super_compress: str, high_quality_mode_opt: str) -> np.ndarray:
|
26 |
"""
|
27 |
Create or load embeddings for the given documents.
|
28 |
|
@@ -30,7 +33,6 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
30 |
docs (list): List of documents to embed.
|
31 |
file_list (list): List of file names to check for existing embeddings.
|
32 |
embeddings_out (np.ndarray): Array to store the embeddings.
|
33 |
-
embedding_model: Model used to generate embeddings.
|
34 |
embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
|
35 |
high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
|
36 |
|
@@ -38,6 +40,33 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
38 |
np.ndarray: The generated or loaded embeddings.
|
39 |
"""
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
# If no embeddings found, make or load in
|
42 |
if embeddings_out.size == 0:
|
43 |
print("Embeddings not found. Loading or generating new ones.")
|
@@ -84,9 +113,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
84 |
embeddings_out = np.round(embeddings_out, 3)
|
85 |
embeddings_out *= 100
|
86 |
|
87 |
-
return embeddings_out
|
88 |
|
89 |
else:
|
90 |
print("Found pre-loaded embeddings.")
|
91 |
|
92 |
-
return embeddings_out
|
|
|
1 |
import time
|
2 |
import numpy as np
|
3 |
import os
|
4 |
+
import spaces
|
5 |
from torch import cuda, backends, version
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
from sklearn.pipeline import make_pipeline
|
8 |
+
from sklearn.decomposition import TruncatedSVD
|
9 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
|
11 |
# Check for torch cuda
|
12 |
# If you want to disable cuda for testing purposes
|
|
|
23 |
torch_device = "cpu"
|
24 |
high_quality_mode = "No"
|
25 |
|
|
|
26 |
|
27 |
+
@spaces.GPU
|
28 |
+
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1") -> np.ndarray:
|
|
|
29 |
"""
|
30 |
Create or load embeddings for the given documents.
|
31 |
|
|
|
33 |
docs (list): List of documents to embed.
|
34 |
file_list (list): List of file names to check for existing embeddings.
|
35 |
embeddings_out (np.ndarray): Array to store the embeddings.
|
|
|
36 |
embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
|
37 |
high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
|
38 |
|
|
|
40 |
np.ndarray: The generated or loaded embeddings.
|
41 |
"""
|
42 |
|
43 |
+
if high_quality_mode_opt == "Yes":
|
44 |
+
# Define a list of possible local locations to search for the model
|
45 |
+
local_embeddings_locations = [
|
46 |
+
"model/embed/", # Potential local location
|
47 |
+
"/model/embed/", # Potential location in Docker container
|
48 |
+
"/home/user/app/model/embed/" # This is inside a Docker container
|
49 |
+
]
|
50 |
+
|
51 |
+
# Attempt to load the model from each local location
|
52 |
+
for location in local_embeddings_locations:
|
53 |
+
try:
|
54 |
+
embedding_model = SentenceTransformer(location)#, truncate_dim=512)
|
55 |
+
print(f"Found local model installation at: {location}")
|
56 |
+
break # Exit the loop if the model is found
|
57 |
+
except Exception as e:
|
58 |
+
print(f"Failed to load model from {location}: {e}")
|
59 |
+
continue
|
60 |
+
else:
|
61 |
+
# If the loop completes without finding the model in any local location
|
62 |
+
embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
|
63 |
+
print("Could not find local model installation. Downloading from Huggingface")
|
64 |
+
else:
|
65 |
+
embedding_model = make_pipeline(
|
66 |
+
TfidfVectorizer(),
|
67 |
+
TruncatedSVD(100, random_state=random_seed)
|
68 |
+
)
|
69 |
+
|
70 |
# If no embeddings found, make or load in
|
71 |
if embeddings_out.size == 0:
|
72 |
print("Embeddings not found. Loading or generating new ones.")
|
|
|
113 |
embeddings_out = np.round(embeddings_out, 3)
|
114 |
embeddings_out *= 100
|
115 |
|
116 |
+
return embeddings_out, embedding_model
|
117 |
|
118 |
else:
|
119 |
print("Found pre-loaded embeddings.")
|
120 |
|
121 |
+
return embeddings_out, embedding_model
|
funcs/topic_core_funcs.py
CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
|
|
7 |
import numpy as np
|
8 |
import time
|
9 |
from bertopic import BERTopic
|
|
|
10 |
|
11 |
from typing import List, Type, Union
|
12 |
PandasDataFrame = Type[pd.DataFrame]
|
@@ -17,13 +18,7 @@ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder
|
|
17 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
19 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
20 |
-
|
21 |
from sklearn.feature_extraction.text import CountVectorizer
|
22 |
-
|
23 |
-
from sentence_transformers import SentenceTransformer
|
24 |
-
from sklearn.pipeline import make_pipeline
|
25 |
-
from sklearn.decomposition import TruncatedSVD
|
26 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
27 |
import funcs.anonymiser as anon
|
28 |
from umap import UMAP
|
29 |
|
@@ -96,84 +91,88 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
96 |
output_list = []
|
97 |
#file_list = [string.name for string in in_files]
|
98 |
|
99 |
-
in_colnames_list_first
|
100 |
|
101 |
-
|
102 |
-
if not "original_index" in data.columns:
|
103 |
-
data = data.reset_index(names="original_index")
|
104 |
|
105 |
-
|
106 |
-
clean_tic = time.perf_counter()
|
107 |
-
print("Starting data clean.")
|
108 |
|
109 |
-
|
|
|
|
|
110 |
|
111 |
-
if
|
112 |
-
|
|
|
113 |
|
114 |
-
|
115 |
-
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
116 |
-
print(clean_time_out)
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
|
121 |
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
|
126 |
-
|
127 |
-
|
|
|
128 |
|
129 |
-
|
|
|
|
|
130 |
|
131 |
-
|
132 |
-
|
133 |
-
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
134 |
-
data = data[data[in_colnames_list_first].str.len() >= 50]
|
135 |
-
data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
|
136 |
-
|
137 |
-
#print("Data shape after duplicate/null removal: ", data.shape)
|
138 |
|
139 |
-
|
140 |
-
progress(0.4, desc= "Anonymising data")
|
141 |
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
-
|
146 |
-
|
147 |
-
data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
|
148 |
|
149 |
-
|
|
|
150 |
|
151 |
-
|
|
|
|
|
152 |
|
153 |
-
|
154 |
-
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
155 |
|
156 |
-
|
157 |
|
158 |
-
|
159 |
-
|
160 |
|
161 |
-
|
162 |
-
data_file_name_no_ext = data_file_name_no_ext + "_split"
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
-
|
172 |
-
|
173 |
|
174 |
-
|
175 |
|
176 |
-
|
177 |
|
178 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
179 |
data.to_csv(out_data_name)
|
@@ -299,27 +298,6 @@ def extract_topics(
|
|
299 |
if high_quality_mode == "Yes":
|
300 |
print("Using high quality embedding model")
|
301 |
|
302 |
-
# Define a list of possible local locations to search for the model
|
303 |
-
local_embeddings_locations = [
|
304 |
-
"model/embed/", # Potential local location
|
305 |
-
"/model/embed/", # Potential location in Docker container
|
306 |
-
"/home/user/app/model/embed/" # This is inside a Docker container
|
307 |
-
]
|
308 |
-
|
309 |
-
# Attempt to load the model from each local location
|
310 |
-
for location in local_embeddings_locations:
|
311 |
-
try:
|
312 |
-
embedding_model = SentenceTransformer(location)#, truncate_dim=512)
|
313 |
-
print(f"Found local model installation at: {location}")
|
314 |
-
break # Exit the loop if the model is found
|
315 |
-
except Exception as e:
|
316 |
-
print(f"Failed to load model from {location}: {e}")
|
317 |
-
continue
|
318 |
-
else:
|
319 |
-
# If the loop completes without finding the model in any local location
|
320 |
-
embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
|
321 |
-
print("Could not find local model installation. Downloading from Huggingface")
|
322 |
-
|
323 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
324 |
|
325 |
# If tfidf embeddings currently exist, wipe these empty
|
@@ -329,15 +307,15 @@ def extract_topics(
|
|
329 |
embeddings_type_state = "large"
|
330 |
|
331 |
# UMAP model uses Bertopic defaults
|
332 |
-
umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
|
333 |
|
334 |
else:
|
335 |
print("Choosing low resource TF-IDF model.")
|
336 |
|
337 |
-
embedding_model = make_pipeline(
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
|
342 |
# If large embeddings currently exist, wipe these empty, then rename embeddings type
|
343 |
if embeddings_type_state == "large":
|
@@ -346,10 +324,10 @@ def extract_topics(
|
|
346 |
embeddings_type_state = "tfidf"
|
347 |
|
348 |
#umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
349 |
-
|
350 |
-
|
351 |
|
352 |
-
embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out,
|
353 |
|
354 |
# If you want to save your embedding files
|
355 |
if return_intermediate_files == "Yes":
|
|
|
7 |
import numpy as np
|
8 |
import time
|
9 |
from bertopic import BERTopic
|
10 |
+
import spaces
|
11 |
|
12 |
from typing import List, Type, Union
|
13 |
PandasDataFrame = Type[pd.DataFrame]
|
|
|
18 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
19 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
20 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
|
|
21 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
|
|
|
|
|
|
|
22 |
import funcs.anonymiser as anon
|
23 |
from umap import UMAP
|
24 |
|
|
|
91 |
output_list = []
|
92 |
#file_list = [string.name for string in in_files]
|
93 |
|
94 |
+
for in_colnames_list_first in in_colnames:
|
95 |
|
96 |
+
print("Cleaning column:", in_colnames_list_first)
|
|
|
|
|
97 |
|
98 |
+
#in_colnames_list_first = in_colnames[0]
|
|
|
|
|
99 |
|
100 |
+
# Reset original index to a new column so you can link it to data outputted from cleaning
|
101 |
+
if not "original_index" in data.columns:
|
102 |
+
data = data.reset_index(names="original_index")
|
103 |
|
104 |
+
if clean_text == "Yes":
|
105 |
+
clean_tic = time.perf_counter()
|
106 |
+
print("Starting data clean.")
|
107 |
|
108 |
+
data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
|
|
|
|
|
109 |
|
110 |
+
if '_clean' not in data_file_name_no_ext:
|
111 |
+
data_file_name_no_ext = data_file_name_no_ext + "_clean"
|
|
|
112 |
|
113 |
+
clean_toc = time.perf_counter()
|
114 |
+
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
115 |
+
print(clean_time_out)
|
116 |
|
117 |
+
# Clean custom regex if exists
|
118 |
+
if not custom_regex.empty:
|
119 |
+
data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
|
120 |
|
121 |
+
if '_clean' not in data_file_name_no_ext:
|
122 |
+
data_file_name_no_ext = data_file_name_no_ext + "_clean"
|
123 |
+
|
124 |
|
125 |
+
if drop_duplicate_text == "Yes":
|
126 |
+
progress(0.3, desc= "Drop duplicates - remove short texts")
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
+
data_file_name_no_ext = data_file_name_no_ext + "_dedup"
|
|
|
129 |
|
130 |
+
#print("Removing duplicates and short entries from data")
|
131 |
+
#print("Data shape before: ", data.shape)
|
132 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
133 |
+
data = data[data[in_colnames_list_first].str.len() >= 50]
|
134 |
+
data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
|
135 |
+
|
136 |
+
#print("Data shape after duplicate/null removal: ", data.shape)
|
137 |
|
138 |
+
if anonymise_drop == "Yes":
|
139 |
+
progress(0.4, desc= "Anonymising data")
|
|
|
140 |
|
141 |
+
if '_anon' not in data_file_name_no_ext:
|
142 |
+
data_file_name_no_ext = data_file_name_no_ext + "_anon"
|
143 |
|
144 |
+
anon_tic = time.perf_counter()
|
145 |
+
|
146 |
+
data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
|
147 |
|
148 |
+
data[in_colnames_list_first] = data_anon_col
|
|
|
149 |
|
150 |
+
print(anonymisation_success)
|
151 |
|
152 |
+
anon_toc = time.perf_counter()
|
153 |
+
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
154 |
|
155 |
+
print(time_out)
|
|
|
156 |
|
157 |
+
if sentence_split_drop == "Yes":
|
158 |
+
progress(0.6, desc= "Splitting text into sentences")
|
159 |
+
|
160 |
+
if '_split' not in data_file_name_no_ext:
|
161 |
+
data_file_name_no_ext = data_file_name_no_ext + "_split"
|
162 |
+
|
163 |
+
anon_tic = time.perf_counter()
|
164 |
+
|
165 |
+
data = expand_sentences_spacy(data, in_colnames_list_first)
|
166 |
+
data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with more than min_sentence_length characters
|
167 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
168 |
+
data.reset_index(inplace=True, drop=True)
|
169 |
|
170 |
+
anon_toc = time.perf_counter()
|
171 |
+
time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
|
172 |
|
173 |
+
print(time_out)
|
174 |
|
175 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
176 |
|
177 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
178 |
data.to_csv(out_data_name)
|
|
|
298 |
if high_quality_mode == "Yes":
|
299 |
print("Using high quality embedding model")
|
300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
302 |
|
303 |
# If tfidf embeddings currently exist, wipe these empty
|
|
|
307 |
embeddings_type_state = "large"
|
308 |
|
309 |
# UMAP model uses Bertopic defaults
|
310 |
+
#umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
|
311 |
|
312 |
else:
|
313 |
print("Choosing low resource TF-IDF model.")
|
314 |
|
315 |
+
# embedding_model = make_pipeline(
|
316 |
+
# TfidfVectorizer(),
|
317 |
+
# TruncatedSVD(100, random_state=random_seed)
|
318 |
+
# )
|
319 |
|
320 |
# If large embeddings currently exist, wipe these empty, then rename embeddings type
|
321 |
if embeddings_type_state == "large":
|
|
|
324 |
embeddings_type_state = "tfidf"
|
325 |
|
326 |
#umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
327 |
+
# UMAP model uses Bertopic defaults
|
328 |
+
umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
|
329 |
|
330 |
+
embeddings_out, embedding_model = make_or_load_embeddings(docs, file_list, embeddings_out, embeddings_super_compress, high_quality_mode, embeddings_name)
|
331 |
|
332 |
# If you want to save your embedding files
|
333 |
if return_intermediate_files == "Yes":
|
requirements.txt
CHANGED
@@ -3,11 +3,10 @@ pandas==2.2.3
|
|
3 |
plotly==5.24.1
|
4 |
scikit-learn==1.5.2
|
5 |
umap-learn==0.5.7
|
6 |
-
gradio==5.
|
7 |
-
boto3==1.35.
|
8 |
transformers==4.46.3
|
9 |
accelerate==1.1.1
|
10 |
-
torch==2.5.1
|
11 |
bertopic==0.16.4
|
12 |
spacy==3.8.0
|
13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
@@ -18,6 +17,9 @@ presidio_analyzer==2.2.355
|
|
18 |
presidio_anonymizer==2.2.355
|
19 |
scipy
|
20 |
polars
|
21 |
-
sentence-transformers==3.
|
22 |
-
|
23 |
-
#
|
|
|
|
|
|
|
|
3 |
plotly==5.24.1
|
4 |
scikit-learn==1.5.2
|
5 |
umap-learn==0.5.7
|
6 |
+
gradio==5.8.0
|
7 |
+
boto3==1.35.71
|
8 |
transformers==4.46.3
|
9 |
accelerate==1.1.1
|
|
|
10 |
bertopic==0.16.4
|
11 |
spacy==3.8.0
|
12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
17 |
presidio_anonymizer==2.2.355
|
18 |
scipy
|
19 |
polars
|
20 |
+
sentence-transformers==3.3.1
|
21 |
+
torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
|
22 |
+
#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
23 |
+
# Specify exact llama_cpp wheel for huggingface compatibility
|
24 |
+
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
|
25 |
+
numpy==1.26.4
|
requirements_aws.txt
CHANGED
@@ -6,7 +6,7 @@ umap-learn==0.5.7
|
|
6 |
boto3==1.35.64
|
7 |
spacy==3.8.0
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
9 |
-
gradio==5.
|
10 |
pyarrow
|
11 |
openpyxl
|
12 |
Faker
|
|
|
6 |
boto3==1.35.64
|
7 |
spacy==3.8.0
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
9 |
+
gradio==5.8.0
|
10 |
pyarrow
|
11 |
openpyxl
|
12 |
Faker
|
requirements_gpu.txt
CHANGED
@@ -18,8 +18,7 @@ presidio_analyzer==2.2.355
|
|
18 |
presidio_anonymizer==2.2.355
|
19 |
scipy
|
20 |
polars
|
21 |
-
llama-cpp-python==0.
|
22 |
-
|
23 |
-
|
24 |
-
#numpy==1.26.4
|
25 |
|
|
|
18 |
presidio_anonymizer==2.2.355
|
19 |
scipy
|
20 |
polars
|
21 |
+
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
22 |
+
sentence-transformers==3.3.1
|
23 |
+
numpy==1.26.4
|
|
|
24 |
|