From 4892471ba16c34fec4928135fa3ea999b87a631a Mon Sep 17 00:00:00 2001 From: Ankan Date: Sun, 3 Nov 2024 16:23:32 +0000 Subject: [PATCH] Added the GrammarAutoCorrectorModel --- .../Grammar Auto Corrector Model/README.md | 42 +++++++++++ .../Grammar Auto Corrector Model/main.py | 74 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 Machine_Learning/Grammar Auto Corrector Model/README.md create mode 100644 Machine_Learning/Grammar Auto Corrector Model/main.py diff --git a/Machine_Learning/Grammar Auto Corrector Model/README.md b/Machine_Learning/Grammar Auto Corrector Model/README.md new file mode 100644 index 0000000000..84d997119e --- /dev/null +++ b/Machine_Learning/Grammar Auto Corrector Model/README.md @@ -0,0 +1,42 @@ +# Grammar AutoCorrector + +A powerful Grammar AutoCorrector tool designed to automatically detect and correct grammatical errors in English sentences. This project leverages NLP techniques and the T5 transformer model for advanced grammar correction, making it suitable for applications in writing assistance tools, educational platforms, and beyond. + +## Features +- Preprocessing techniques such as tokenization, lemmatization, stop word removal, and punctuation removal. +- Training of a grammar correction model using large datasets with grammatically correct sentences. +- Ability to identify and correct common grammatical errors in sentences. + + + +## This project uses the following modules: + +### Modules Used + +1. Transformers +2. Torch +3. NLTK +4. SpaCy +5. Pandas +6. NumPy +7. re (Regular Expressions) +8. Scikit-Learn +9. pytest +10. datasets (Hugging Face) +11. yaml +12. 
# tqdm + + +## Data +Download appropriate grammar correction datasets, such as: +- [Cambridge English Write & Improve + LOCNESS](https://ilexir.co.uk/datasets/index.html) +- [Grammarly GEC Dataset](https://www.grammarly.com/research/grammatical-error-correction/) +- [JFLEG](https://github.com/keisks/jfleg) + + + +# Connect with Me + +- **GitHub**: [Peart-Guy](https://github.com/Peart-Guy) +- **LinkedIn**: [Ankan Mukhopadhyay](https://www.linkedin.com/in/ankan-mukhopadhyaypeartguy/) + diff --git a/Machine_Learning/Grammar Auto Corrector Model/main.py b/Machine_Learning/Grammar Auto Corrector Model/main.py new file mode 100644 index 0000000000..2286ad8e9b --- /dev/null +++ b/Machine_Learning/Grammar Auto Corrector Model/main.py @@ -0,0 +1,74 @@
# ^ README.md tail and main.py diff header from the enclosing patch, kept verbatim.

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Download NLTK resources
nltk.download('punkt')
# Newer NLTK releases make word_tokenize() look up 'punkt_tab' instead of
# 'punkt'; fetching both keeps preprocess_text() working on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Initialize the lemmatizer and stop words list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalize *text* into a list of lemmatized, non-stop-word tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized with NLTK's
    ``word_tokenize``, tokens found in ``stop_words`` are dropped and the
    remaining tokens are lemmatized.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings, in their original order.
    """
    # NOTE(review): nothing in this script calls this helper. Lemmatizing and
    # dropping stop words removes inflections and function words, so it is
    # probably not suitable as a pre-step for grammar correction -- confirm
    # the intended use before wiring it in.
    lowered = text.lower()
    # Strip punctuation before tokenizing so no punctuation tokens are produced.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = word_tokenize(without_punctuation)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]


# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# NOTE(review): train_model() reads the columns "input_texts" and
# "output_texts"; bookcorpus presumably exposes only raw text -- confirm, and
# consider one of the paired GEC corpora listed in the README instead.
dataset = load_dataset("bookcorpus", split="train")  # For BooksCorpus
# NOTE(review): wiki_dataset is not referenced anywhere else in this script.
wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train")  # For Wikipedia

# Define a training
# function: fine-tune the shared T5 model on paired (erroneous, corrected) sentences.
def train_model(dataset, max_length=128):
    """Fine-tune the module-level ``model`` on sentence pairs.

    Args:
        dataset: A Hugging Face ``datasets.Dataset`` or a plain mapping of
            column name to list of strings. It must provide the columns
            ``"input_texts"`` (sentences to correct) and ``"output_texts"``
            (their corrected forms).
        max_length: Maximum number of tokens kept for each source and each
            target sentence; longer sequences are truncated.

    Raises:
        KeyError: If a required column is missing from *dataset*.
    """
    required = ("input_texts", "output_texts")
    available = getattr(dataset, "column_names", None)
    if available is None:
        # Plain mapping: iterating it yields the column names.
        available = list(dataset)
    missing = [name for name in required if name not in available]
    if missing:
        raise KeyError(
            f"train_model needs the columns {required}; missing {missing} "
            f"(available: {list(available)})"
        )

    # Imported here so this block does not depend on edits to the file's
    # top-level import list; both packages are already used by this script.
    from datasets import Dataset
    from transformers import DataCollatorForSeq2Seq

    if not hasattr(dataset, "map"):
        dataset = Dataset.from_dict({name: list(dataset[name]) for name in required})

    def _encode(batch):
        """Turn one batch of text pairs into model features with labels."""
        features = tokenizer(
            ["correct: " + text for text in batch["input_texts"]],
            max_length=max_length,
            truncation=True,
        )
        # Targets carry no task prefix: whatever is in the labels is what
        # generate() learns to emit, and correct_grammar() returns the decoded
        # output as-is.
        targets = tokenizer(
            list(batch["output_texts"]),
            max_length=max_length,
            truncation=True,
        )
        features["labels"] = targets["input_ids"]
        return features

    # Trainer consumes token ids and labels, not raw strings, so the text
    # columns are replaced by the encoded features. Encoding batch-wise avoids
    # materializing one corpus-sized padded tensor.
    tokenized = dataset.map(
        _encode,
        batched=True,
        remove_columns=dataset.column_names,
    )

    # Define Trainer
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized,
        # Pads each batch dynamically and masks label padding from the loss.
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    )

    trainer.train()


def correct_grammar(text, max_length=128):
    """Return the model's corrected version of *text*.

    Args:
        text: The sentence to correct.
        max_length: Upper bound on the length of the generated sequence, in
            tokens. Without an explicit bound, generation falls back to the
            library's short default cap and longer sentences come back
            truncated.

    Returns:
        The decoded output with special tokens removed. Empty or
        whitespace-only input is returned unchanged without running the model.
    """
    if not text or not text.strip():
        return text

    # Training may have moved the model to an accelerator; inputs must live on
    # the same device as the weights.
    encoded = tokenizer("correct: " + text, return_tensors="pt").to(model.device)
    # Trainer leaves the model in training mode; switch dropout off so the
    # output is deterministic.
    model.eval()
    outputs = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Example usage
test_sentence = "She go to the market every morning."

if __name__ == "__main__":
    # Train the model on the processed dataset
    train_model(dataset)
    print("Corrected Sentence:", correct_grammar(test_sentence))