nickmuchi committed on
Commit
17828cf
·
1 Parent(s): 19c6e71

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +28 -16
functions.py CHANGED
@@ -323,29 +323,41 @@ def sentiment_pipe(earnings_text):
323
  return earnings_sentiment, earnings_sentences
324
 
325
  @st.cache_data
326
def chunk_and_preprocess_text(text, thresh=450):
    """Split cleaned text into sentence-aligned chunks for summarization.

    The text is cleaned with ``clean_text`` and split into sentences with
    ``sent_tokenize``. Sentences are packed greedily, in order, into
    chunks. Size is measured in words obtained by splitting on single
    spaces, not in model tokens.

    Args:
        text: Raw input text.
        thresh: Maximum number of space-separated words per chunk. A
            single sentence longer than this still becomes its own chunk.

    Returns:
        A list of chunk strings, each the space-joined words of one or
        more consecutive sentences.
    """
    word_groups = []

    for sent in sent_tokenize(clean_text(text)):
        words = sent.split(" ")
        # Grow the latest chunk while it stays within the limit;
        # otherwise (or for the very first sentence) open a new one.
        if word_groups and len(word_groups[-1]) + len(words) <= thresh:
            word_groups[-1].extend(words)
        else:
            word_groups.append(words)

    return [" ".join(group) for group in word_groups]
350
 
351
  @st.cache_data
 
323
  return earnings_sentiment, earnings_sentences
324
 
325
  @st.cache_data
326
def chunk_and_preprocess_text(text, model_name):
    """Chunk and preprocess text for summarization.

    The text is cleaned with ``clean_text``, split into sentences with
    ``sent_tokenize``, and the sentences are packed greedily, in order,
    into chunks whose token count (as measured by the tokenizer loaded
    for ``model_name``) does not exceed
    ``tokenizer.max_len_single_sentence``.

    Args:
        text: Raw input text.
        model_name: Name or path passed to
            ``AutoTokenizer.from_pretrained`` to pick the tokenizer used
            for counting tokens.

    Returns:
        A list of chunk strings. Each chunk is its sentences concatenated
        with a single space after every sentence (so chunks end with a
        trailing space). An empty list is returned when the text yields
        no sentences. A single sentence that alone exceeds the limit is
        kept as its own chunk rather than being split.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_tokens = tokenizer.max_len_single_sentence
    sentences = sent_tokenize(clean_text(text))

    chunks = []
    chunk = ""
    length = 0  # running token count of the chunk being built

    for sentence in sentences:
        # Tokenize once per sentence; the count is reused whether the
        # sentence joins the current chunk or starts a new one.
        sentence_length = len(tokenizer.tokenize(sentence))

        # Flush only a non-empty chunk, so an oversized first sentence
        # does not produce an empty leading chunk.
        if chunk and length + sentence_length > max_tokens:
            chunks.append(chunk)
            chunk = ""
            length = 0

        chunk += sentence + " "
        length += sentence_length

    # Always save the trailing chunk. Previously it was lost whenever the
    # final sentence overflowed and started a new chunk.
    if chunk:
        chunks.append(chunk)

    return chunks
362
 
363
  @st.cache_data