Update functions.py
functions.py  +28 -16
functions.py
CHANGED
@@ -323,29 +323,41 @@ def sentiment_pipe(earnings_text):
         return earnings_sentiment, earnings_sentences
 
 @st.cache_data
-def chunk_and_preprocess_text(text,
-
-
+def chunk_and_preprocess_text(text, model_name):
+
+    '''Chunk and preprocess text for summarization'''
 
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     sentences = sent_tokenize(clean_text(text))
-    #sentences = [i.text for i in list(article.sents)]
 
-
+    # initialize
+    length = 0
+    chunk = ""
     chunks = []
+    count = -1
 
     for sentence in sentences:
-
-
-                chunks[current_chunk].extend(sentence.split(" "))
-            else:
-                current_chunk += 1
-                chunks.append(sentence.split(" "))
-        else:
-            chunks.append(sentence.split(" "))
-
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = " ".join(chunks[chunk_id])
+        count += 1
+        combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter
 
+        if combined_length <= tokenizer.max_len_single_sentence: # if it doesn't exceed
+            chunk += sentence + " " # add the sentence to the chunk
+            length = combined_length # update the length counter
+
+            # if it is the last sentence
+            if count == len(sentences) - 1:
+                chunks.append(chunk) # save the chunk
+
+        else:
+            chunks.append(chunk) # save the chunk
+            # reset
+            length = 0
+            chunk = ""
+
+            # take care of the overflow sentence
+            chunk += sentence + " "
+            length = len(tokenizer.tokenize(sentence))
+
     return chunks
 
 @st.cache_data
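The rework replaces the old word-count chunking with token-aware chunking: sentences accumulate into a chunk until adding the next one would push the token count past tokenizer.max_len_single_sentence, so every chunk fits the summarization model's input limit. Below is a minimal usage sketch, assuming the function is imported from this repo's functions.py; the checkpoint name and transcript file are illustrative placeholders, not part of this commit.

# Illustrative usage sketch only -- not part of this commit.
# Assumptions: the checkpoint name and transcript file below are placeholders.
from transformers import pipeline
from functions import chunk_and_preprocess_text  # the function reworked in this commit

model_name = "sshleifer/distilbart-cnn-12-6"         # assumed summarization checkpoint
transcript_text = open("earnings_call.txt").read()   # hypothetical earnings-call transcript

# Each chunk stays within tokenizer.max_len_single_sentence for this checkpoint,
# so the summarizer never has to truncate a chunk mid-sentence.
chunks = chunk_and_preprocess_text(transcript_text, model_name)

summarizer = pipeline("summarization", model=model_name)
summaries = [summarizer(chunk, truncation=True)[0]["summary_text"] for chunk in chunks]
print(" ".join(summaries))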