grapplerulrich committed
Fix no matching strings found in text

Files changed:
- .gitignore +1 -0
- app.py +30 -11
- beautiful_soup/beautiful_soup.py +4 -0
.gitignore CHANGED
@@ -7,3 +7,4 @@ __pycache__
 /summaries
 /.streamlit
 /transformer
+/content
app.py CHANGED
@@ -67,7 +67,8 @@ def search_results( query ):
     return results
 
 def get_summary( url, keywords ):
-
+    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
+    file_path = 'summaries/' + url_id + '.json'
 
     # Create cache directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
@@ -77,16 +78,34 @@ def get_summary( url, keywords ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
-
-
-
+        try:
+            strings = get_url_content( url )
+            content_cache = 'content/' + url_id + '.txt'
+
+            # Create cache directory if it doesn't exist.
+            makedirs(dirname(content_cache), exist_ok=True)
+
+            # Check if content cache file exists.
+            if exists( content_cache ):
+                with open( content_cache, 'r' ) as file:
+                    content = file
+            else:
+                content = prep_chunks_summary( strings, keywords )
+                # Save content to cache file.
+                with open( content_cache, 'w' ) as file:
+                    file.write( content )
+
+            # Generate summary from compiled content.
+            summary = generate_summary( content, 200 )
+        except Exception as exception:
+            raise exception
         # Save results to cache file.
         with open( file_path, 'w' ) as file:
             json.dump( summary, file )
 
     return summary
 
-def generate_summary( content, max_length
+def generate_summary( content, max_length ):
     """
     Generate summary for content.
     """
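The rewritten get_summary keys its caches on a uuid5 of the URL: scraped page text goes to content/<id>.txt and the finished summary to summaries/<id>.json. A minimal sketch of that cache-or-build pattern, with a generic build_summary callable standing in for the app's scrape-chunk-summarise pipeline (the callable and the function name are illustrative, not part of the commit):

import json
import uuid
from os import makedirs
from os.path import dirname, exists

def cached_summary( url, build_summary ):
    # Stable cache key derived from the URL, as in the commit.
    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    file_path = 'summaries/' + url_id + '.json'

    # Create cache directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )

    # Reuse a cached summary when one exists.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            return json.load( file )

    # Otherwise build the summary and cache it for the next run.
    summary = build_summary( url )
    with open( file_path, 'w' ) as file:
        json.dump( summary, file )
    return summary

One caveat in the committed branch: on a content-cache hit it assigns content = file, which binds the (soon-closed) file handle rather than its text; file.read() would give the cached string back.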
@@ -147,6 +166,9 @@ def filter_sentences_by_keywords( strings, keywords ):
             if nlp.vocab.strings[match_id] in ["QueryList"]:
                 sentences.append(sentence.text)
 
+    if ( len(sentences) == 0 ):
+        raise Exception('No sentences with keywords found.')
+
     return sentences
 
 def split_content_into_chunks( sentences ):
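The guard added here is the actual fix behind the commit title: when the spaCy matcher finds no sentence containing the query keywords, filter_sentences_by_keywords now raises instead of passing empty text to the summariser. A rough, self-contained sketch of such a filter; the en_core_web_sm model, the PhraseMatcher setup, and the LOWER attribute are assumptions, only the "QueryList" key and the empty-result guard come from the diff:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load( 'en_core_web_sm' )

def filter_sentences_by_keywords( strings, keywords ):
    matcher = PhraseMatcher( nlp.vocab, attr='LOWER' )
    # Register the query terms under the "QueryList" key used in app.py.
    matcher.add( 'QueryList', [nlp.make_doc( keyword ) for keyword in keywords] )

    sentences = []
    for text in strings:
        doc = nlp( text )
        for sentence in doc.sents:
            matches = matcher( sentence.as_doc() )
            if any( nlp.vocab.strings[match_id] == 'QueryList' for match_id, start, end in matches ):
                sentences.append( sentence.text )

    # The commit's fix: fail loudly instead of summarising nothing.
    if len( sentences ) == 0:
        raise Exception( 'No sentences with keywords found.' )

    return sentences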
@@ -162,7 +184,6 @@ def split_content_into_chunks( sentences ):
         sentence_word_count = len(sentence.split(' '))
         # If the word count plus the current sentence is larger then 512, start a new chunk.
         if word_count + sentence_word_count > 512:
-            st.write("Number of words(tokens): {}".format(word_count))
             chunks.append(chunk)
             chunk = '' # Reset chunk.
             word_count = 0 # Reset word count.
@@ -171,7 +192,6 @@ def split_content_into_chunks( sentences ):
         word_count += sentence_word_count
         chunk += sentence + ' '
 
-    st.write("Number of words(tokens): {}".format(word_count))
     chunks.append(chunk)
 
     return chunks
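These two hunks only drop the debugging st.write calls; the chunking itself is untouched. For reference, the word-count logic of split_content_into_chunks reduces to roughly this (512 being the input limit the app assumes for the summarisation model):

def split_content_into_chunks( sentences ):
    chunks = []
    chunk = ''
    word_count = 0
    for sentence in sentences:
        sentence_word_count = len( sentence.split(' ') )
        # If adding this sentence would push the chunk past 512 words, close it.
        if word_count + sentence_word_count > 512:
            chunks.append( chunk )
            chunk = ''       # Reset chunk.
            word_count = 0   # Reset word count.
        word_count += sentence_word_count
        chunk += sentence + ' '
    # Keep whatever is left over as the final chunk.
    chunks.append( chunk )
    return chunks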
@@ -187,13 +207,13 @@ def prep_chunks_summary( strings, keywords ):
         number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks.
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than 512 tokens.
             max_length = int( 512 / number_of_chunks )
-            st.write("Max length: {}".format(max_length))
 
             content = ''
             # Loop through chunks and generate summary.
             for chunk in chunks:
+                # Rudementary method to count number of tokens in a chunk.
                 chunk_length = len( chunk.split(' ') )
                 # If chunk is shorter than max length, divide chunk length by 2.
                 if chunk_length < max_length:
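prep_chunks_summary budgets the summariser so the concatenated per-chunk summaries fit back under the 512-token limit of the final pass: the budget is 512 divided by the number of chunks, and chunks already shorter than the budget aim for about half their own length instead. A sketch of that budgeting with a placeholder summarize( text, max_length ) callable standing in for the app's generate_summary (the callable, the single-chunk handling, and the return shape are assumptions):

def summarise_chunks( chunks, summarize ):
    number_of_chunks = len( chunks )
    if number_of_chunks <= 1:
        # With a single chunk there is nothing to budget.
        return chunks[0] if chunks else ''

    # Split the 512-token budget evenly across chunks so the combined
    # per-chunk summaries still fit into one final summarisation pass.
    max_length = int( 512 / number_of_chunks )

    content = ''
    for chunk in chunks:
        # Rudimentary token count: split on spaces.
        chunk_length = len( chunk.split(' ') )
        # For chunks shorter than the budget, target roughly half the
        # chunk's own length instead.
        if chunk_length < max_length:
            target_length = int( chunk_length / 2 )
        else:
            target_length = max_length
        content += summarize( chunk, target_length ) + ' '
    return content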
@@ -209,7 +229,7 @@ def prep_chunks_summary( strings, keywords ):
             return content
 
     except Exception as exception:
-
+        raise exception
 
 def main():
     st.title('Racoon Search')
@@ -255,7 +275,6 @@ def main():
                 st.markdown(summary[0]['summary_text'])
             except Exception as exception:
                 exception_notice(exception)
-                return
 
         progress_bar.progress( ( index + 1 ) / number_of_results )
 
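The last app.py hunk removes an early return after exception_notice, so one failed summary no longer aborts the whole results loop and the progress bar keeps advancing. A small Streamlit sketch of that pattern; fetch_summary and the error helper are stand-ins, not the app's real functions:

import streamlit as st

def exception_notice( exception ):
    # Stand-in for the app's error helper.
    st.warning( str( exception ) )

def fetch_summary( url ):
    # Placeholder for get_summary(); here it always fails to show the flow.
    raise Exception( 'No sentences with keywords found.' )

def render_results( urls ):
    progress_bar = st.progress( 0.0 )
    number_of_results = len( urls )
    for index, url in enumerate( urls ):
        try:
            st.markdown( fetch_summary( url ) )
        except Exception as exception:
            # Report the failure but keep looping; the removed `return`
            # would have stopped here and left the progress bar stuck.
            exception_notice( exception )
        progress_bar.progress( ( index + 1 ) / number_of_results )

render_results( ['https://example.com/a', 'https://example.com/b'] )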
beautiful_soup/beautiful_soup.py CHANGED
@@ -168,10 +168,14 @@ def get_tags_text( soup ):
             for div in tag.find_all(text=True, recursive=False):
                 found_text = div.get_text( ' ', strip=True )
                 if found_text != '':
+                    found_text = found_text.replace( '\n', ' ' )
+                    found_text = found_text.replace( '\r', ' ' )
                     text.append( found_text )
         else :
             found_text = tag.get_text( ' ', strip=True )
             if found_text != '':
+                found_text = found_text.replace( '\n', ' ' )
+                found_text = found_text.replace( '\r', ' ' )
                 text.append( found_text )
     return text
 
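The beautiful_soup change replaces hard line breaks with spaces at the point where text is collected, so the keyword matching downstream is not split mid-sentence. A self-contained sketch of the same normalisation; the tag list and sample HTML are made up for illustration:

from bs4 import BeautifulSoup

def get_clean_text( html ):
    soup = BeautifulSoup( html, 'html.parser' )
    text = []
    for tag in soup.find_all( ['h1', 'h2', 'h3', 'p', 'li'] ):
        found_text = tag.get_text( ' ', strip=True )
        if found_text != '':
            # Same normalisation the commit adds: flatten newlines and carriage
            # returns so each collected string is a single line of prose.
            found_text = found_text.replace( '\n', ' ' )
            found_text = found_text.replace( '\r', ' ' )
            text.append( found_text )
    return text

print( get_clean_text( '<p>First line\nsecond line</p>\n<p>Another paragraph</p>' ) )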