Awlly committed on
Commit
acd5d65
·
2 Parent(s): 4a76dec 1f670ae

Merge branch 'main' of hf.co:spaces/Awlly/NLP_app

Browse files
README.md CHANGED
@@ -10,3 +10,82 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ ## NLP App
15
+ Hugging Face
16
+ # Streamlit app with natural language processing 💡
17
+ Elbrus Bootcamp | Phase-2 | Team Project
18
+
19
+ ## Team🧑🏻‍💻
20
+ 1. [Awlly](https://github.com/Awlly)
21
+ 2. [sakoser](https://github.com/sakoser)
22
+ 3. [whoisida](https://github.com/whoisida)
23
+
24
+ ## Task 📌
25
+ Create a service that classifies movie reviews into good, neutral and bad categories, a service that classifies user input as toxic or non-toxic, as well as a GPT 2 based text generation service that was trained to emulate a certain author’s writing.
26
+
27
+ ## Contents 📝
28
+ 1. Classifies movie reviews using LSTM, ruBert, BOW 💨 [Dataset](https://drive.google.com/file/d/1c92sz81bEfOw-rutglKpmKGm6rySmYbt/view?usp=sharing)
29
+ 2. Classifies user input as toxic or non-toxic using ruBert-tiny-toxicity 📑 [Dataset](https://drive.google.com/file/d/1O7orH9CrNEhnbnA5KjXji8sgrn6iD5n-/view?usp=drive_link)
30
+ 3. GPT 2 based text generation service
31
+
32
+ ## Deployment 🎈
33
+ The service is implemented on [Hugging Face](https://huggingface.co/spaces/Awlly/NLP_app)
34
+
35
+ ## Libraries 📖
36
+ ```python
37
+ import os
38
+ import unicodedata
39
+ import nltk
40
+ from dataclasses import dataclass
41
+ import joblib
42
+ import numpy as np
43
+ import matplotlib.pyplot as plt
44
+ import torch
45
+ import torch.nn as nn
46
+ import torch.nn.functional as F
47
+ import torch.optim as optim
48
+ from torch.utils.data import DataLoader, TensorDataset
49
+ from torchvision.datasets import ImageFolder
50
+ from torchvision import datasets
51
+ from torchvision import transforms as T
52
+ from torchvision.io import read_image
53
+ from torch.utils.data import Dataset, random_split
54
+ import torchutils as tu
55
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
56
+ from typing import Tuple
57
+ from tqdm import tqdm
58
+ from transformers import AutoModel, AutoTokenizer
59
+ from transformers import AutoModelForSequenceClassification
60
+ import pydensecrf.densecrf as dcrf
61
+ import pydensecrf.utils as dcrf_utils
62
+ from preprocessing import data_preprocessing
63
+ import streamlit as st
64
+ import string
65
+ from sklearn.linear_model import LogisticRegression
66
+ import re
67
+
68
+
69
+
70
+
71
+ from preprocessing import preprocess_single_string
72
+ ```
73
+
74
+
75
+ from preprocessing import data_preprocessing
76
+
77
+
78
+
79
+
80
+ ## Guide 📜
81
+ #### How to run locally?
82
+
83
+ 1. To create a Python virtual environment for running the code, enter:
84
+
85
+ ``python3 -m venv my-env``
86
+
87
+ 2. Activate the new environment:
88
+
89
+ * Windows: ```my-env\Scripts\activate.bat```
90
+ * macOS and Linux: ```source my-env/bin/activate```
91
+
__pycache__/preprocessing.cpython-310.pyc DELETED
Binary file (2.32 kB)
 
app_models/__pycache__/bag_of_words_MODEL.cpython-310.pyc DELETED
Binary file (630 Bytes)
 
app_models/__pycache__/gpt_MODEL.cpython-310.pyc DELETED
Binary file (1.08 kB)
 
app_models/__pycache__/lstm_MODEL.cpython-310.pyc DELETED
Binary file (3.49 kB)
 
app_models/__pycache__/rubert_MODEL.cpython-310.pyc DELETED
Binary file (1.43 kB)
 
app_models/__pycache__/toxicity_MODEL.cpython-310.pyc DELETED
Binary file (985 Bytes)
 
app_models/gpt_MODEL.py CHANGED
@@ -10,7 +10,7 @@ model = GPT2LMHeadModel.from_pretrained(model_path)
10
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
  model.to(device)
12
 
13
- def generate_text(prompt_text, length, temperature):
14
  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
15
  encoded_prompt = encoded_prompt.to(device)
16
 
@@ -22,7 +22,7 @@ def generate_text(prompt_text, length, temperature):
22
  top_p=0.9,
23
  repetition_penalty=1.2,
24
  do_sample=True,
25
- num_return_sequences=1,
26
  )
27
 
28
  # Decode the generated text
 
10
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
  model.to(device)
12
 
13
+ def generate_text(prompt_text, length, temperature, beams):
14
  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
15
  encoded_prompt = encoded_prompt.to(device)
16
 
 
22
  top_p=0.9,
23
  repetition_penalty=1.2,
24
  do_sample=True,
25
+ num_return_sequences=beams,
26
  )
27
 
28
  # Decode the generated text
app_pages/__pycache__/page1_model_comparison.cpython-310.pyc DELETED
Binary file (904 Bytes)
 
app_pages/__pycache__/page2_rubert_toxicity.cpython-310.pyc DELETED
Binary file (794 Bytes)
 
app_pages/__pycache__/page3_gpt_model.cpython-310.pyc DELETED
Binary file (845 Bytes)
 
app_pages/page1_model_comparison.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from app_models.rubert_MODEL import classify_text
3
  from app_models.bag_of_words_MODEL import predict
4
  from app_models.lstm_MODEL import predict_review
 
5
 
6
  class_prefix = 'This review is likely...'
7
 
@@ -11,11 +12,32 @@ def run():
11
 
12
  # Example placeholder for user input
13
  user_input = st.text_area("")
14
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Placeholder buttons for model selection
16
- if st.button('Classify with BoW/TF-IDF'):
17
- st.write(f'{class_prefix}{predict(user_input)}')
18
- if st.button('Classify with LSTM'):
19
- st.write(f'{class_prefix}{predict_review(user_input)}')
20
- if st.button('Classify with ruBERT'):
21
- st.write(f'{class_prefix}{classify_text(user_input)}')
 
2
  from app_models.rubert_MODEL import classify_text
3
  from app_models.bag_of_words_MODEL import predict
4
  from app_models.lstm_MODEL import predict_review
5
+ import time
6
 
7
  class_prefix = 'This review is likely...'
8
 
 
12
 
13
  # Example placeholder for user input
14
  user_input = st.text_area("")
15
+
16
+
17
+ if st.button('Classify with All Models'):
18
+ # Measure and display Bag of Words/TF-IDF prediction time
19
+ start_time = time.time()
20
+ bow_tfidf_result = predict(user_input)
21
+ end_time = time.time()
22
+ st.write(f'{class_prefix} {bow_tfidf_result} according to Bag of Words/TF-IDF. Time taken: {end_time - start_time:.2f} seconds.')
23
+
24
+ # Measure and display LSTM prediction time
25
+ start_time = time.time()
26
+ lstm_result = predict_review(user_input)
27
+ end_time = time.time()
28
+ st.write(f'{class_prefix} {lstm_result} according to LSTM. Time taken: {end_time - start_time:.2f} seconds.')
29
+
30
+ # Measure and display ruBERT prediction time
31
+ start_time = time.time()
32
+ rubert_result = classify_text(user_input)
33
+ end_time = time.time()
34
+ st.write(f'{class_prefix} {rubert_result} according to ruBERT. Time taken: {end_time - start_time:.2f} seconds.')
35
+
36
+
37
  # Placeholder buttons for model selection
38
+ # if st.button('Classify with BoW/TF-IDF'):
39
+ # st.write(f'{class_prefix}{predict(user_input)}')
40
+ # if st.button('Classify with LSTM'):
41
+ # st.write(f'{class_prefix}{predict_review(user_input)}')
42
+ # if st.button('Classify with ruBERT'):
43
+ # st.write(f'{class_prefix}{classify_text(user_input)}')
app_pages/page3_gpt_model.py CHANGED
@@ -6,9 +6,10 @@ def run():
6
  st.title('GPT Text Generation')
7
  prompt_text = st.text_area("Input Text", "Type here...")
8
  length = st.slider("Length of Generated Text", min_value=50, max_value=500, value=200)
9
- temperature = st.slider("Temperature", min_value=0.1, max_value=1.0, value=0.7, step=0.1)
 
10
 
11
  if st.button('Generate Text'):
12
  with st.spinner('Generating...'):
13
- generated_text = generate_text(prompt_text, length, temperature)
14
  st.text_area("Generated Text", generated_text, height=250)
 
6
  st.title('GPT Text Generation')
7
  prompt_text = st.text_area("Input Text", "Type here...")
8
  length = st.slider("Length of Generated Text", min_value=50, max_value=500, value=200)
9
+ temperature = st.slider("Temperature", min_value=0.1, max_value=2.0, value=0.7, step=0.1)
10
+ beams = st.slider("Number of Generations", min_value=2, max_value=10, value=4, step=1)
11
 
12
  if st.button('Generate Text'):
13
  with st.spinner('Generating...'):
14
+ generated_text = generate_text(prompt_text, length, temperature, beams)
15
  st.text_area("Generated Text", generated_text, height=250)