Spaces:
Running
Running
File size: 9,232 Bytes
8212cca 0c1d321 8212cca d113ccb 8212cca ad4243a 8212cca d113ccb 8212cca d34dfc3 8212cca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# --- Dependencies ---------------------------------------------------------
import re
import subprocess
import sys
from collections import Counter

import pandas as pd
import streamlit as st
import textdistance
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# sentencepiece is required by T5Tokenizer; install it on the fly when the
# deployment image is missing it so the app can still boot.
try:
    import sentencepiece
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])

# st.set_page_config must be the very first Streamlit command executed.
st.set_page_config(page_title="TextTweakAI", layout="wide")
# Grammar-correction model (fine-tuned T5), loaded once and cached across reruns.
@st.cache_resource
def load_grammar_model():
    """Load the T5 grammar-correction tokenizer and model.

    Returns:
        tuple: (tokenizer, model, device) where device is 'cuda' when a GPU
        is available, else 'cpu'; the model is moved to that device.
    """
    model_name = 'abhinavsarkar/Google-T5-base-Grammatical_Error_Correction-Finetuned-C4-200M-550k'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tok = T5Tokenizer.from_pretrained(model_name)
    mdl = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    return tok, mdl, device

tokenizer, model, torch_device = load_grammar_model()
# Load vocabulary for spell checking (optimized loading)
@st.cache_resource
def load_vocabulary():
    """Build the spell-check vocabulary from the bundled corpus files.

    Returns:
        tuple: (V, word_freq, probs) where
            V (set[str]): unique lowercase tokens across all corpora,
            word_freq (Counter): token -> occurrence count,
            probs (dict[str, float]): token -> relative frequency.
    """
    file_paths = ['book.txt', 'alice_in_wonderland.txt', 'big.txt', 'shakespeare.txt']
    words = []
    for file_path in file_paths:
        # NOTE(review): assumes the corpora are UTF-8 text; the explicit
        # encoding avoids platform-dependent locale defaults.
        with open(file_path, 'r', encoding='utf-8') as f:
            words += re.findall(r'\w+', f.read().lower())
    V = set(words)
    word_freq = Counter(words)
    # Fix: the original recomputed sum(word_freq.values()) inside the dict
    # comprehension for every key, making this step O(n^2) in vocabulary
    # size. Compute the total once.
    total = sum(word_freq.values())
    probs = {word: count / total for word, count in word_freq.items()}
    return V, word_freq, probs

V, word_freq, probs = load_vocabulary()
# Precompute Jaccard similarity scores for spell check
def precompute_similarities(input_word):
    """Score *input_word* against every vocabulary word by bigram Jaccard similarity.

    Returns a list of floats in [0, 1], one per word, in the iteration order
    of ``word_freq`` so the caller can attach it as a DataFrame column.
    """
    input_word = input_word.lower()
    # Fix: the original constructed textdistance.Jaccard(qval=2) once per
    # vocabulary word inside the comprehension; hoist the loop-invariant
    # metric object so it is built a single time.
    jaccard = textdistance.Jaccard(qval=2)
    # similarity = 1 - distance
    return [1 - jaccard.distance(v, input_word) for v in word_freq]
def my_autocorrect(input_paragraph, top_n=5):
    """Find words not in the vocabulary and rank candidate corrections.

    Args:
        input_paragraph (str): text to check; lowercased before tokenizing.
        top_n (int): number of suggestions to keep per misspelled word.

    Returns:
        tuple: (incorrect_words, corrected_words) — parallel lists where
        each entry of corrected_words is a DataFrame with columns
        ['Word', 'Similarity', 'Prob'], 1-indexed, best match first.
    """
    words_in_paragraph = re.findall(r'\w+', input_paragraph.lower())
    incorrect_words = []
    corrected_words = []
    # Fix: the original rebuilt this Word/Prob table from `probs` for every
    # misspelled word even though it never changes within a call. Build it
    # lazily once and copy per word.
    base_df = None
    for word in words_in_paragraph:
        if word in V:
            continue
        if base_df is None:
            base_df = pd.DataFrame(
                {'Word': list(probs.keys()), 'Prob': list(probs.values())}
            )
        df = base_df.copy()
        df['Similarity'] = precompute_similarities(word)
        output = (
            df.sort_values(['Similarity', 'Prob'], ascending=False)
              .head(top_n)[['Word', 'Similarity', 'Prob']]
              .reset_index(drop=True)
        )
        # 1-based index reads more naturally in the Streamlit table.
        output.index = output.index + 1
        incorrect_words.append(word)
        corrected_words.append(output)
    return incorrect_words, corrected_words
# Function for grammar correction
def correct_grammar(input_text, num_return_sequences=2):
    """Generate grammar-corrected variants of *input_text* with the T5 model.

    Args:
        input_text (str): a single sentence to correct.
        num_return_sequences (int): number of beam-search candidates to return.

    Returns:
        list[str]: decoded correction candidates.
    """
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): temperature has no effect under pure beam search without
    # sampling — confirm whether sampling was intended here.
    generated = model.generate(
        **encoded,
        max_length=64,
        num_beams=4,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
# Streamlit app layout
def main():
    """Render the two-column spell-checker / grammar-checker UI."""
    st.title("📚TextTweakAI: Your Personalized Spell & Grammar Checker")
    st.markdown("""
Welcome to the **TextTweakAI**! This app is designed to help you improve your writing by detecting and correcting spelling and grammar errors. Simply enter a paragraph below and let the app do the rest. Each section provides unique suggestions to refine your text.
""")
    paragraph = st.text_area("✨ Enter a paragraph to check for spelling and grammar issues:", height=200)

    # Two side-by-side sections
    col1, col2 = st.columns(2)

    # Persist results in session state so they survive Streamlit reruns.
    if 'spelling_results' not in st.session_state:
        st.session_state.spelling_results = None
    if 'grammar_results' not in st.session_state:
        st.session_state.grammar_results = None

    with col1:
        st.header("🔍 Spell Checker")
        st.markdown("""
**About the Spell Checker:**
Our spell checker uses a vocabulary from multiple literary texts to detect potential misspellings. It offers suggestions ranked by similarity and probability, helping you to identify and correct errors with ease.
**How to use:**
Enter a paragraph and click **Check Spelling** to see any misspelled words along with suggestions.
""")
        if st.button("Check Spelling"):
            if paragraph:
                with st.spinner("Checking spelling..."):
                    incorrect_words, corrected_words = my_autocorrect(paragraph)
                    if incorrect_words:
                        st.session_state.spelling_results = (incorrect_words, corrected_words)
                    else:
                        # A bare string acts as the "no errors" sentinel below.
                        st.session_state.spelling_results = ("✅ No spelling errors detected!", [])
            else:
                st.warning("Please enter a paragraph to check for spelling.")
        if st.session_state.spelling_results:
            incorrect_words, corrected_words = st.session_state.spelling_results
            if isinstance(incorrect_words, str):
                st.success(incorrect_words)
            else:
                st.subheader("🔴 Spelling Errors & Suggestions:")
                for word, suggestions_df in zip(incorrect_words, corrected_words):
                    st.write(f"**Misspelled Word**: `{word}`")
                    with st.expander(f"Suggestions for `{word}`"):
                        st.table(suggestions_df[['Word', 'Similarity', 'Prob']])

    with col2:
        st.header("📝 Grammar Checker")
        st.markdown("""
**About the Grammar Checker:**
Powered by a fine-tuned T5 model, our grammar checker analyzes each sentence for potential errors in structure, tense, and word choice. It offers refined suggestions to enhance readability and grammatical accuracy.
**How to use:**
Enter a paragraph and click **Check Grammar** to review each sentence with suggested improvements.
""")
        if st.button("Check Grammar"):
            if paragraph:
                with st.spinner("Checking grammar..."):
                    # Split on sentence-final punctuation followed by spaces.
                    sentences = re.split(r'(?<=[.!?]) +', paragraph)
                    grammar_results = []
                    for sentence in sentences:
                        if sentence.strip():
                            corrected_sentences = correct_grammar(sentence, num_return_sequences=2)
                            grammar_results.append((sentence, corrected_sentences))
                    st.session_state.grammar_results = grammar_results
            else:
                st.warning("Please enter a paragraph to check for grammar.")
        if st.session_state.grammar_results:
            st.subheader("🔵 Grammar Corrections:")
            for sentence, corrected_sentences in st.session_state.grammar_results:
                with st.expander(f"**Original Sentence:** {sentence}", expanded=True):
                    st.write("### Suggestions:")
                    for corrected_sentence in corrected_sentences:
                        st.write(f"- {corrected_sentence}")

    # Model details section
    st.markdown("---")
    st.header("📘 Grammar Checker Information")
    st.markdown("""
### Grammar Checker Model
The Grammar Checker model, fine-tuned for grammatical error correction (GEC), is ideal for enhancing writing quality across various domains. Below, you'll find relevant resources related to this model's development and usage.
- 🔗 **[Finetuned Model on Hugging Face](https://huggingface.co/abhinavsarkar/Google-T5-base-Grammatical_Error_Correction-Finetuned-C4-200M-550k)**
Access the model details, fine-tuning specifics, and download options on Hugging Face.
- 📊 **[Used Dataset on Hugging Face](https://huggingface.co/datasets/abhinavsarkar/C4-200m-550k-Determiner)**
Explore the pre-processed dataset used to train this model.
- 📂 **[Original Dataset URL](https://www.kaggle.com/datasets/felixstahlberg/the-c4-200m-dataset-for-gec)**
This dataset contains 200 million sentences with diverse structures, hosted on Kaggle.
- 🛠️ **[GitHub Repository](https://github.com/AbhinavSarkarr/Spell-and-Grammer-Checker)**
Access the code repository for dataset preparation, model training, and additional development resources.
""")

    # Spell Checker Information
    st.markdown("---")
    st.header("🔍 Spell Checker Information")
    st.markdown("""
### Spell Checker
The Spell Checker leverages a corpus containing multiple text resources to suggest corrections for spelling errors. The algorithm uses **Jaccard Similarity** and **Relative Probability** to identify the closest matches to the input words, ensuring accuracy in suggestions.
- 📂 **[Corpus Resource](https://drive.google.com/drive/u/0/folders/1WsvpWHKUv3OI2mRce-NPg4HsVPyhfk0e)**
The vocabulary for this checker is based on a collection of literary works and publicly available texts.
""")

# Run the app
if __name__ == "__main__":
    main()
|