Shivam29rathore committed on
Commit 5298891 · 1 Parent(s): ac3def5

app.py file

Files changed (1)
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import pickle
+import torch
+import io
+
+#contents = pickle.load(f) becomes...
+#contents = CPU_Unpickler(f).load()
+
+model_path = "t5_10K_small_cpu.sav"
+
+# Load the pickled model from disk
+with open(model_path, "rb") as f:
+    model = pickle.load(f)
+
+#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
+import nltk
+from finbert_embedding.embedding import FinbertEmbedding
+import pandas as pd
+from nltk.cluster import KMeansClusterer
+import numpy as np
+import os
+from scipy.spatial import distance_matrix
+from tensorflow.python.lib.io import file_io
+
+nltk.download('punkt')
+
+
+def make_abstractive_summary(word):
+    # Directory used to store each text datapoint as a file
+    data_path = "/tmp/"
+    if not os.path.exists(data_path):
+        os.makedirs(data_path)
+    input_ = "/tmp/input.txt"
+    # Write the input to disk so each datapoint can be handled as a txt file
+    with open(input_, "w") as file:
+        file.write(word)
+    # Read the written txt back into a variable to start clustering
+    with open(input_, "r") as f:
+        text = f.read()
+    # Split the text into sentence tokens
+    tokens = nltk.sent_tokenize(text)
+    # Strip leading and trailing whitespace from each sentence
+    sentences = [sentence.strip() for sentence in tokens]
+    # Create a DataFrame from the sentences
+    data = pd.DataFrame(sentences)
+    # Name the column containing the sentence tokens
+    data.columns = ['Sentences']
+
+    # Create a numerical embedding for each sentence in the DataFrame
+    def get_sentence_embeddings():
+        # Collect one embedding per sentence
+        sentence_list = []
+        for i in tokens:
+            sentence_embedding = model.sentence_vector(i)
+            sentence_list.append(sentence_embedding)
+        # Convert each tensor embedding to a NumPy array
+        sentence_array = []
+        for i in sentence_list:
+            sentence_array.append(i.numpy())
+        # Return the sentence embeddings as a list of arrays
+        return sentence_array
+
+    # Apply get_sentence_embeddings to create the Embeddings column
+    data['Embeddings'] = get_sentence_embeddings()
+
+    # Number of sentences expected in the summary
+    NUM_CLUSTERS = 10
+    iterations = 8
+    # Convert the embeddings into an array and store it in X
+    X = np.array(data['Embeddings'].to_list())
+
+    # Build the k-means clusterer
+    Kclusterer = KMeansClusterer(
+        NUM_CLUSTERS,
+        distance=nltk.cluster.util.cosine_distance,
+        repeats=iterations, avoid_empty_clusters=True)
+
+    # If the text is too short, k-means raises an error;
+    # the try/except block returns the text itself as the result in that case.
+    try:
+        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
+
+        # Record each sentence's cluster and centroid in new Cluster and Centroid columns
+        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
+        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
+
+    # Return the text if the clustering algorithm raises an exception and move to the next text file
+    except ValueError:
+        return text
+
+    # Distance of each embedding from the centroid of its cluster
+    def distance_from_centroid(row):
+        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
+
+    # Apply distance_from_centroid to every row
+    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
+
+    # Final summary: the sentence closest to each cluster centroid, in document order
+    summary = " ".join(data.sort_values(
+        'Distance_From_Centroid',
+        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
+    return ("FinBERT MODEL OUTPUT:--->" + summary,
+            " Length of Input:---->" + str(len(word)),
+            " Length of Output:----> " + str(len(summary)))
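
Note on the commented-out CPU_Unpickler lines near the top of app.py: they refer to a helper for loading a pickle that contains GPU-saved torch tensors on a CPU-only machine, but the class itself is not defined in this file. A minimal sketch of such a helper, assuming the usual pickle/torch remapping approach, could look like this:

import io
import pickle
import torch

class CPU_Unpickler(pickle.Unpickler):
    # Unpickler that remaps torch tensor storages to the CPU while loading.
    def find_class(self, module, name):
        # torch serializes tensor storages via torch.storage._load_from_bytes;
        # intercept that call and force map_location='cpu' so objects pickled
        # on a GPU machine can be loaded on a CPU-only host.
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)

With a class like this in place, model = pickle.load(f) would become model = CPU_Unpickler(f).load(), as the comment in app.py suggests.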