Sentiment Analysis with BERT and RoBERTa #933

Open · wants to merge 4 commits into base: main
Binary file added .DS_Store
Binary file not shown.
83 changes: 83 additions & 0 deletions Enhanced_Sentiment_analysis/Model/README.md
@@ -0,0 +1,83 @@
Sentiment Analysis using BERT and RoBERTa
🎯 Goal
The main goal of this project is to develop a sentiment analysis model that can classify text as positive or negative using deep learning techniques. This project leverages BERT and RoBERTa models to enhance the accuracy and robustness of sentiment classification.

🧵 Dataset
Dataset link: [Provide your dataset link here]

🧾 Description
This project aims to classify text data into binary sentiment labels (positive/negative). It uses pre-trained transformer models (BERT and RoBERTa) for fine-tuning on a custom dataset. The model's performance is evaluated using accuracy, precision, recall, and F1 score to determine the best approach for sentiment analysis.

🧮 What I Had Done!
Data Preprocessing:

- Cleaned and prepared the text data using NLTK by removing stopwords and non-alphanumeric characters.
- Tokenized and encoded the text with BertTokenizer and RobertaTokenizer for model input (a minimal sketch follows this list).

Model Training:

- Implemented and fine-tuned a BERT model (bert-base-uncased) on the preprocessed data.
- Implemented and fine-tuned a RoBERTa model (roberta-base) on the same dataset.

Evaluation:

- Evaluated both models using accuracy, precision, recall, and F1 score.
- Compared the performance of BERT and RoBERTa to identify which model performs better on the given dataset.
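
A minimal sketch of the preprocessing and tokenization steps, mirroring model.py (the dataset path is a placeholder):

```python
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, RobertaTokenizer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Lowercase, tokenize, and drop stopwords and non-alphanumeric tokens
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    return ' '.join(w for w in words if w.isalnum() and w not in stop_words)

data = pd.read_csv('path_to_your_dataset.csv')  # placeholder path
data['text'] = data['text'].apply(preprocess_text)

# Encode the cleaned text for both models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert_enc = bert_tokenizer(list(data['text']), truncation=True, padding=True, max_length=128)
roberta_enc = roberta_tokenizer(list(data['text']), truncation=True, padding=True, max_length=128)
```
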
🚀 Models Implemented

- BERT (bert-base-uncased): Fine-tuned for sentiment classification. Chosen for its ability to understand complex language structures and context in text data.
- RoBERTa (roberta-base): Fine-tuned for sentiment classification. Chosen for its robustness and improved performance over BERT on certain NLP tasks, thanks to its optimized training strategy.

Both are fine-tuned as two-label (positive/negative) classifiers; see the sketch below.
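
A minimal sketch, mirroring the model definitions in model.py:

```python
from transformers import BertForSequenceClassification, RobertaForSequenceClassification

# Two-label (positive/negative) classification heads on top of the pre-trained encoders
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
```
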
📚 Libraries Needed

- pandas
- numpy
- torch
- transformers
- scikit-learn
- nltk
📈 Performance of the Models based on the Accuracy Scores

BERT Model:

- Accuracy: [Add BERT accuracy here]
- Precision: [Add BERT precision here]
- Recall: [Add BERT recall here]
- F1 Score: [Add BERT F1 score here]

RoBERTa Model:

- Accuracy: [Add RoBERTa accuracy here]
- Precision: [Add RoBERTa precision here]
- Recall: [Add RoBERTa recall here]
- F1 Score: [Add RoBERTa F1 score here]

These metrics are computed with scikit-learn, as sketched below.
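
A minimal sketch, mirroring the compute_metrics helper in model.py:

```python
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    # pred.label_ids holds the true labels; pred.predictions holds the model logits
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': accuracy_score(labels, preds),
            'precision': precision, 'recall': recall, 'f1': f1}
```
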
🛠️ Steps to Run the Project

1. Clone the repository:

```bash
git clone https://github.com/abhisheks008/DL-Simplified.git
```

2. Install the required packages:

```bash
pip install -r requirements.txt
```
3. Download NLTK data by running the following in your Python script or interpreter:

```python
import nltk
nltk.download('stopwords')
nltk.download('punkt')
```
4. Prepare the dataset: update the dataset path in the code and make sure the file has the required columns (text and label); a quick sanity check is sketched below.
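
A minimal check (the file name is a placeholder):

```python
import pandas as pd

df = pd.read_csv('path_to_your_dataset.csv')  # placeholder path
missing = {'text', 'label'} - set(df.columns)
assert not missing, f"Dataset is missing columns: {missing}"
```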

5. Run the script:

```bash
python train_and_evaluate.py
```

The script trains and evaluates the BERT and RoBERTa models and prints the evaluation metrics.
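
As an optional follow-up (not part of the original script), you can reload a fine-tuned checkpoint and score new text. The save path and the label mapping (1 = positive) are assumptions that depend on your dataset:

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Assumes the fine-tuned model was saved first, e.g. trainer_bert.save_model('./bert_sentiment')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('./bert_sentiment')
model.eval()

inputs = tokenizer("This movie was fantastic!", return_tensors='pt',
                   truncation=True, padding=True, max_length=128)
with torch.no_grad():
    pred = model(**inputs).logits.argmax(dim=-1).item()
print('positive' if pred == 1 else 'negative')  # assumed label mapping
```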

✒️ Your Signature
[Sagar Kumar Sahu]
113 changes: 113 additions & 0 deletions Enhanced_Sentiment_analysis/Model/model.py
@@ -0,0 +1,113 @@
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Text preprocessing using NLTK
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

# Load dataset
data = pd.read_csv('path_to_your_dataset.csv')
data['text'] = data['text'].apply(preprocess_text)

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Tokenization and Encoding for BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = bert_tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = bert_tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Tokenization and Encoding for RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings_roberta = roberta_tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings_roberta = roberta_tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Convert to PyTorch dataset
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_bert = SentimentDataset(train_encodings, train_labels)
val_dataset_bert = SentimentDataset(val_encodings, val_labels)
train_dataset_roberta = SentimentDataset(train_encodings_roberta, train_labels)
val_dataset_roberta = SentimentDataset(val_encodings_roberta, val_labels)

# Model definition for BERT
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Model definition for RoBERTa
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Evaluation Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training BERT Model
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset_bert,
    eval_dataset=val_dataset_bert,
    compute_metrics=compute_metrics,
)
trainer_bert.train()

# Training RoBERTa Model
trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=val_dataset_roberta,
    compute_metrics=compute_metrics,
)
trainer_roberta.train()

# Evaluate models
bert_eval = trainer_bert.evaluate()
roberta_eval = trainer_roberta.evaluate()

print("BERT Evaluation:", bert_eval)
print("RoBERTa Evaluation:", roberta_eval)
6 changes: 6 additions & 0 deletions Enhanced_Sentiment_analysis/requirement.txt
@@ -0,0 +1,6 @@
pandas==1.5.3
numpy==1.23.5
torch==1.13.1
transformers==4.28.1
scikit-learn==1.2.2
nltk==3.7
10 changes: 10 additions & 0 deletions Sentiment Analysis Model/.idea/Sentiment Analysis Model.iml

Some generated files are not rendered by default.