AleksBlacky committed on
Commit
3722795
·
1 Parent(s): 8cf1f84

Change UI, add readable topic names

Browse files
__pycache__/model.cpython-39.pyc CHANGED
Binary files a/__pycache__/model.cpython-39.pyc and b/__pycache__/model.cpython-39.pyc differ
 
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
  from pandas import DataFrame
3
  import seaborn as sns
4
- from model import ArxivClassifierModel, ArxivClassifierModelsPipeline
5
 
6
  st.markdown("# Hello, friend!")
7
  st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
@@ -15,12 +15,12 @@ with st.form(key="my_form"):
15
 
16
  with c2:
17
  doc_title = st.text_area(
18
- "Paste your abstract title below (max 100 words)",
19
  height=210,
20
  )
21
 
22
  doc_abstract = st.text_area(
23
- "Paste your abstract text below (max 100500 words)",
24
  height=410,
25
  )
26
 
@@ -44,7 +44,7 @@ with st.form(key="my_form"):
44
  "⚠️ Your abstract contains "
45
  + str(len_abstract)
46
  + " words."
47
- + " Only the first 50 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
48
  )
49
 
50
  doc_abstract = doc_abstract[:MAX_WORDS_ABSTRACT]
@@ -68,18 +68,12 @@ st.markdown("## 🎈 Yor article probably about: ")
68
  st.header("")
69
 
70
  df = (
71
- DataFrame(preds_topic.items(), columns=["Topic", "Prob"])
72
- .sort_values(by="Prob", ascending=False)
73
  .reset_index(drop=True)
74
  )
75
  df.index += 1
76
 
77
- df2 = (
78
- DataFrame(preds_maintopic.items(), columns=["Topic", "Prob"])
79
- .sort_values(by="Prob", ascending=False)
80
- .reset_index(drop=True)
81
- )
82
- df2.index += 1
83
 
84
  # Add styling
85
  cmGreen = sns.light_palette("green", as_cmap=True)
@@ -87,27 +81,21 @@ cmRed = sns.light_palette("red", as_cmap=True)
87
  df = df.style.background_gradient(
88
  cmap=cmGreen,
89
  subset=[
90
- "Prob",
91
- ],
92
- )
93
- df2 = df2.style.background_gradient(
94
- cmap=cmGreen,
95
- subset=[
96
- "Prob",
97
  ],
98
  )
99
 
100
  c1, c2, c3 = st.columns([1, 3, 1])
101
 
102
  format_dictionary = {
103
- "Prob": "{:.1%}",
104
  }
105
 
106
  df = df.format(format_dictionary)
107
- df2 = df2.format(format_dictionary)
108
 
109
  with c2:
110
  st.markdown("#### We suppose your research about: ")
111
- st.table(df2)
 
112
  st.markdown("##### More detailed, it's about topic: ")
113
  st.table(df)
 
1
  import streamlit as st
2
  from pandas import DataFrame
3
  import seaborn as sns
4
+ from model import ArxivClassifierModelsPipeline
5
 
6
  st.markdown("# Hello, friend!")
7
  st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
 
15
 
16
  with c2:
17
  doc_title = st.text_area(
18
+ "Paste your paper's title below (max 100 words)",
19
  height=210,
20
  )
21
 
22
  doc_abstract = st.text_area(
23
+ "Paste your paper's abstract text below (max 100500 words)",
24
  height=410,
25
  )
26
 
 
44
  "⚠️ Your abstract contains "
45
  + str(len_abstract)
46
  + " words."
47
+ + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
48
  )
49
 
50
  doc_abstract = doc_abstract[:MAX_WORDS_ABSTRACT]
 
68
  st.header("")
69
 
70
  df = (
71
+ DataFrame(preds_topic.items(), columns=["Topic", "Probability"])
72
+ .sort_values(by="Probability", ascending=False)
73
  .reset_index(drop=True)
74
  )
75
  df.index += 1
76
 
 
 
 
 
 
 
77
 
78
  # Add styling
79
  cmGreen = sns.light_palette("green", as_cmap=True)
 
81
  df = df.style.background_gradient(
82
  cmap=cmGreen,
83
  subset=[
84
+ "Probability",
 
 
 
 
 
 
85
  ],
86
  )
87
 
88
  c1, c2, c3 = st.columns([1, 3, 1])
89
 
90
  format_dictionary = {
91
+ "Probability": "{:.1%}",
92
  }
93
 
94
  df = df.format(format_dictionary)
 
95
 
96
  with c2:
97
  st.markdown("#### We suppose your research about: ")
98
+ st.markdown(f"### {preds_maintopic}! ")
99
+ st.markdown(f"Wow, we're impressed, are you addicted to {preds_maintopic.lower()}?! Coool! ")
100
  st.markdown("##### More detailed, it's about topic: ")
101
  st.table(df)
model.py CHANGED
@@ -2,36 +2,6 @@ import streamlit as st
2
  import pickle
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
 
5
- class ArxivClassifierModel():
6
-
7
- def __init__(self):
8
- self.model = self.__load_model()
9
-
10
- model_name_global = "allenai/scibert_scivocab_uncased"
11
- self.tokenizer = AutoTokenizer.from_pretrained(model_name_global)
12
- with open('./models/scibert/decode_dict.pkl', 'rb') as f:
13
- self.decode_dict = pickle.load(f)
14
-
15
- def make_predict(self, text):
16
- # tokenizer_ = AutoTokenizer.from_pretrained(model_name_global)
17
- tokens = self.tokenizer(text, return_tensors="pt")
18
-
19
- outs = self.model(tokens.input_ids)
20
-
21
- probs = outs["logits"].softmax(dim=-1).tolist()[0]
22
- topic_probs = {}
23
- for i, p in enumerate(probs):
24
- if p > 0.1:
25
- topic_probs[self.decode_dict[i]] = p
26
- return topic_probs
27
-
28
- # allow_output_mutation=True
29
- @st.cache(suppress_st_warning=True)
30
- def __load_model(self):
31
- st.write("Loading big model")
32
- return AutoModelForSequenceClassification.from_pretrained("models/scibert/")
33
-
34
-
35
 
36
  class ArxivClassifierModelsPipeline():
37
 
@@ -51,6 +21,12 @@ class ArxivClassifierModelsPipeline():
51
  with open('models/maintopic_clf/decode_dict_maintopic.pkl', 'rb') as f:
52
  self.decode_dict_maintopic = pickle.load(f)
53
 
 
 
 
 
 
 
54
  def make_predict(self, text):
55
  tokens_topic = self.topic_tokenizer(text, return_tensors="pt")
56
  topic_outs = self.model_topic_clf(tokens_topic.input_ids)
@@ -58,19 +34,17 @@ class ArxivClassifierModelsPipeline():
58
  topic_probs = {}
59
  for i, p in enumerate(probs_topic):
60
  if p > 0.1:
61
- topic_probs[self.decode_dict_topic[i]] = p
 
 
 
62
 
63
  tokens_maintopic = self.maintopic_tokenizer(text, return_tensors="pt")
64
  maintopic_outs = self.model_maintopic_clf(tokens_maintopic.input_ids)
65
  probs_maintopic = maintopic_outs["logits"].softmax(dim=-1).tolist()[0]
66
- maintopic_probs = {}
67
- for i, p in enumerate(probs_maintopic):
68
- if p > 0.1:
69
- maintopic_probs[self.decode_dict_maintopic[i]] = p
70
 
71
-
72
-
73
- return topic_probs, maintopic_probs
74
 
75
  @st.cache(suppress_st_warning=True)
76
  def __load_topic_clf(self):
 
2
  import pickle
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  class ArxivClassifierModelsPipeline():
7
 
 
21
  with open('models/maintopic_clf/decode_dict_maintopic.pkl', 'rb') as f:
22
  self.decode_dict_maintopic = pickle.load(f)
23
 
24
+ with open('models/maintopic_clf/main_topic_dict.pkl', 'rb') as f:
25
+ self.main_topic_dict = pickle.load(f)
26
+
27
+ with open('models/scibert/topic_dict.pkl', 'rb') as f:
28
+ self.topic_dict = pickle.load(f)
29
+
30
  def make_predict(self, text):
31
  tokens_topic = self.topic_tokenizer(text, return_tensors="pt")
32
  topic_outs = self.model_topic_clf(tokens_topic.input_ids)
 
34
  topic_probs = {}
35
  for i, p in enumerate(probs_topic):
36
  if p > 0.1:
37
+ if self.decode_dict_topic[i] in self.topic_dict:
38
+ topic_probs[self.topic_dict[self.decode_dict_topic[i]]] = p
39
+ else:
40
+ topic_probs[self.decode_dict_topic[i]] = p
41
 
42
  tokens_maintopic = self.maintopic_tokenizer(text, return_tensors="pt")
43
  maintopic_outs = self.model_maintopic_clf(tokens_maintopic.input_ids)
44
  probs_maintopic = maintopic_outs["logits"].softmax(dim=-1).tolist()[0]
45
+ maintopic_probs = self.decode_dict_maintopic[0]
 
 
 
46
 
47
+ return topic_probs, self.main_topic_dict[maintopic_probs]
 
 
48
 
49
  @st.cache(suppress_st_warning=True)
50
  def __load_topic_clf(self):
models/maintopic_clf/main_topic_dict.pkl ADDED
Binary file (663 Bytes). View file
 
models/scibert/topic_dict.pkl ADDED
Binary file (1.8 kB). View file