forked from Harshdev098/Research-Nexas
Implement AI-Powered Automated Paper Matching
Fixes Harshdev098#341
1 parent fc3cfe0 · commit a35a10d
Showing 19 changed files with 711 additions and 0 deletions.
@@ -0,0 +1,5 @@
from flask import Flask

app = Flask(__name__)

from . import routes
@@ -0,0 +1,53 @@
from flask import Flask, request, jsonify
from models.profile_analyzer import ProfileAnalyzer
from models.semantic_matcher import SemanticMatcher
from models.recommender import PersonalizedRecommender

app = Flask(__name__)

profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()

@app.route('/analyze_profile', methods=['POST'])
def analyze_profile():
    data = request.json
    profile_text = data.get('profile_text')

    if not profile_text:
        return jsonify({'error': 'Profile text is required'}), 400

    analysis = profile_analyzer.analyze_profile(profile_text)
    return jsonify(analysis)

@app.route('/match_papers', methods=['POST'])
def match_papers():
    data = request.json
    user_profile = data.get('user_profile')
    papers = data.get('papers')

    if not user_profile or not papers:
        return jsonify({'error': 'Both user profile and papers are required'}), 400

    # Get embeddings
    profile_embedding = semantic_matcher.compute_embeddings([user_profile])[0]
    paper_embeddings = semantic_matcher.compute_embeddings(papers)

    # Find matches
    matches, scores = semantic_matcher.find_matches(profile_embedding, paper_embeddings)

    return jsonify({
        'matches': [papers[i] for i in matches],
        'scores': scores.tolist()
    })

@app.route('/get_recommendations', methods=['POST'])
def get_recommendations():
    data = request.json
    user_id = data.get('user_id')

    if not user_id:
        return jsonify({'error': 'User ID is required'}), 400

    # Assumption: the request body also supplies the user profile and candidate
    # papers (each carrying an 'embeddings' vector), since
    # PersonalizedRecommender.predict() requires both.
    user_profile = data.get('user_profile')
    papers = data.get('papers', [])
    recommendations = recommender.predict(user_profile, papers)
    return jsonify({
        'recommendations': [
            {'paper': paper, 'score': float(score)}
            for paper, score in recommendations
        ]
    })
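For reference, a minimal sketch of a client call against the /match_papers endpoint, assuming the app is running locally on Flask's default port 5000; the profile and paper strings are invented test data.

import requests

# Hypothetical payload: one profile string plus candidate paper texts
payload = {
    'user_profile': 'machine learning for healthcare diagnostics',
    'papers': [
        'Deep learning methods in healthcare applications',
        'Climate change effects on global ecosystems',
        'Machine learning algorithms in financial markets',
    ],
}
response = requests.post('http://127.0.0.1:5000/match_papers', json=payload)
print(response.json())  # {'matches': [...], 'scores': [...]}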
Empty file.
@@ -0,0 +1,4 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study,processed_text,processed_title,embeddings
1,"Deep Learning Approaches in Healthcare","This paper reviews...",..."deep learning healthcare ai"..."deep learning approaches healthcare",[0.123, 0.456, ...]
2,"Climate Change Impact Analysis","A comprehensive study...",..."climate change environment analysis"..."climate change impact analysis",[0.234, 0.567, ...]
...
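One practical wrinkle with this layout: pandas' to_csv serializes the embedding vectors as strings, so they must be parsed back into lists on load. A minimal sketch, assuming the embeddings column holds complete bracketed lists rather than the elided values shown above:

import ast
import pandas as pd

df = pd.read_csv('data/processed/processed_papers.csv')
# Each cell is a string like "[0.123, 0.456]"; literal_eval restores the list
df['embeddings'] = df['embeddings'].apply(ast.literal_eval)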
@@ -0,0 +1,4 @@
user_id,name,email,interests,skills,academic_background,research_experience,processed_interests,processed_skills,embeddings
1,"Alex Thompson",...,"ai machine learning healthcare","python tensorflow data analysis",...,[0.111, 0.222, ...]
2,"Maria Garcia",...,"climate science environmental studies","r gis statistical analysis",...,[0.333, 0.444, ...]
...
@@ -0,0 +1,18 @@
# Data directories
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep

# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/
env/

# IDE
.vscode/
.idea/
Empty file.
@@ -0,0 +1,6 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study
1,"Deep Learning Approaches in Healthcare","This paper reviews deep learning methods in healthcare applications...","John Smith, Jane Doe","deep learning, healthcare, AI","2023-01-15","Computer Science"
2,"Climate Change Impact Analysis","A comprehensive study of climate change effects on global ecosystems...","Alice Johnson, Bob Wilson","climate change, environment, analysis","2023-02-20","Environmental Science"
3,"Quantum Computing Advances","Recent developments in quantum computing and their implications...","David Brown, Sarah Lee","quantum computing, physics, technology","2023-03-10","Physics"
4,"Machine Learning in Finance","Applications of machine learning algorithms in financial markets...","Mike Chen, Lisa Wang","machine learning, finance, algorithms","2023-04-05","Finance"
5,"Renewable Energy Systems","Analysis of modern renewable energy technologies...","Emma Davis, Tom Miller","renewable energy, sustainability","2023-05-12","Engineering"
@@ -0,0 +1,6 @@
user_id,name,email,interests,skills,academic_background,research_experience
1,"Alex Thompson","[email protected]","AI, Machine Learning, Healthcare","Python, TensorFlow, Data Analysis","PhD in Computer Science","5 years in AI research"
2,"Maria Garcia","[email protected]","Climate Science, Environmental Studies","R, GIS, Statistical Analysis","MSc in Environmental Science","3 years in climate research"
3,"James Wilson","[email protected]","Quantum Physics, Computing","Quantum Algorithms, Mathematics, C++","PhD in Physics","7 years in quantum computing"
4,"Sophie Chen","[email protected]","Finance, Machine Learning","Python, Financial Modeling, Deep Learning","MBA, MSc in Data Science","4 years in fintech"
5,"Ryan Peters","[email protected]","Renewable Energy, Sustainability","Engineering Design, Solar Systems","MEng in Energy Systems","6 years in energy sector"
@@ -0,0 +1,50 @@
import pandas as pd
from preprocessing import DataPreprocessor
from models import ProfileAnalyzer, SemanticMatcher, PersonalizedRecommender
from api import app

def load_data():
    # Load your data here (this is just an example)
    papers = pd.read_csv('data/raw/papers.csv')
    profiles = pd.read_csv('data/raw/user_profiles.csv')
    return papers, profiles

def preprocess_data(papers, profiles):
    preprocessor = DataPreprocessor()
    processed_papers = preprocessor.process_papers(papers)
    processed_profiles = preprocessor.process_profiles(profiles)
    return processed_papers, processed_profiles

def initialize_models():
    profile_analyzer = ProfileAnalyzer()
    semantic_matcher = SemanticMatcher()
    recommender = PersonalizedRecommender()
    return profile_analyzer, semantic_matcher, recommender

def main():
    # Load and preprocess data
    papers, profiles = load_data()
    processed_papers, processed_profiles = preprocess_data(papers, profiles)

    # Initialize models
    profile_analyzer, semantic_matcher, recommender = initialize_models()

    # Compute embeddings for papers and profiles; pass plain lists,
    # since SentenceTransformer.encode expects a string or list of strings
    paper_embeddings = semantic_matcher.compute_embeddings(processed_papers['processed_text'].tolist())
    profile_embeddings = semantic_matcher.compute_embeddings(processed_profiles['processed_interests'].tolist())

    # Add embeddings to the dataframes
    processed_papers['embeddings'] = paper_embeddings.tolist()
    processed_profiles['embeddings'] = profile_embeddings.tolist()

    # Save processed data
    processed_papers.to_csv('data/processed/processed_papers.csv', index=False)
    processed_profiles.to_csv('data/processed/processed_profiles.csv', index=False)

    print("Data preprocessing and model initialization complete.")

    # Run the Flask app
    app.run(debug=True)

if __name__ == '__main__':
    main()
@@ -0,0 +1,3 @@
from .profile_analyzer import ProfileAnalyzer
from .semantic_matcher import SemanticMatcher
from .recommender import PersonalizedRecommender
@@ -0,0 +1,40 @@
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

class ProfileAnalyzer:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.keyword_extractor = TfidfVectorizer(max_features=100)

    def extract_keywords(self, text):
        doc = self.nlp(text)
        keywords = []

        # Extract named entities
        for ent in doc.ents:
            keywords.append(ent.text)

        # Extract noun phrases
        for chunk in doc.noun_chunks:
            keywords.append(chunk.text)

        return list(set(keywords))

    def extract_skills(self, text):
        doc = self.nlp(text)
        skills = []

        # Custom skill extraction logic
        for token in doc:
            if token.pos_ in ['NOUN', 'PROPN']:
                skills.append(token.text)

        return list(set(skills))

    def analyze_profile(self, profile_text):
        return {
            'keywords': self.extract_keywords(profile_text),
            'skills': self.extract_skills(profile_text),
            'interests': self.extract_keywords(profile_text)  # Can be refined further
        }
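For a quick sense of what analyze_profile returns, a minimal usage sketch, assuming en_core_web_sm is installed (python -m spacy download en_core_web_sm); the profile text is invented:

analyzer = ProfileAnalyzer()
result = analyzer.analyze_profile(
    'PhD student working on deep learning for medical imaging, '
    'experienced with Python and TensorFlow'
)
print(result['keywords'])  # named entities and noun phrases, deduplicated
print(result['skills'])    # nouns and proper nouns, e.g. 'Python', 'TensorFlow'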
@@ -0,0 +1,42 @@
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class PersonalizedRecommender:
    def __init__(self):
        self.model = RandomForestClassifier()
        self.feedback_data = []

    def prepare_features(self, user_profile, paper):
        # Combine user and paper features
        return np.concatenate([
            user_profile['embeddings'],
            paper['embeddings']
        ])

    def train(self, training_data):
        X = []
        y = []

        for item in training_data:
            features = self.prepare_features(item['user'], item['paper'])
            X.append(features)
            y.append(item['rating'])

        self.model.fit(X, y)

    def predict(self, user_profile, papers):
        predictions = []

        for paper in papers:
            features = self.prepare_features(user_profile, paper)
            score = self.model.predict_proba([features])[0][1]
            predictions.append((paper, score))

        return sorted(predictions, key=lambda x: x[1], reverse=True)

    def update_feedback(self, user_id, paper_id, rating):
        self.feedback_data.append({
            'user_id': user_id,
            'paper_id': paper_id,
            'rating': rating
        })
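A toy train-and-rank run for the recommender, with made-up four-dimensional vectors standing in for real sentence-transformer embeddings; note that predict_proba(...)[0][1] presumes binary 0/1 ratings:

import numpy as np

user = {'embeddings': np.array([0.1, 0.2, 0.3, 0.4])}
papers = [
    {'id': 1, 'embeddings': np.array([0.1, 0.2, 0.3, 0.5])},
    {'id': 2, 'embeddings': np.array([0.9, 0.8, 0.7, 0.6])},
]
training_data = [
    {'user': user, 'paper': papers[0], 'rating': 1},
    {'user': user, 'paper': papers[1], 'rating': 0},
]

recommender = PersonalizedRecommender()
recommender.train(training_data)
for paper, score in recommender.predict(user, papers):
    print(paper['id'], round(score, 3))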
@@ -0,0 +1,25 @@
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SemanticMatcher:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def compute_embeddings(self, texts):
        return self.model.encode(texts)

    def compute_similarity(self, embedding1, embedding2):
        return cosine_similarity(
            embedding1.reshape(1, -1),
            embedding2.reshape(1, -1)
        )[0][0]

    def find_matches(self, query_embedding, candidate_embeddings, top_k=5):
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            candidate_embeddings
        )[0]

        top_indices = np.argsort(similarities)[::-1][:top_k]
        return top_indices, similarities[top_indices]
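An end-to-end check of the matcher on invented paper snippets (the all-MiniLM-L6-v2 weights are downloaded on first use):

matcher = SemanticMatcher()
paper_texts = [
    'Deep learning methods in healthcare applications',
    'Climate change effects on global ecosystems',
    'Quantum computing and its implications',
]
query_embedding = matcher.compute_embeddings(['machine learning for medical diagnosis'])[0]
candidate_embeddings = matcher.compute_embeddings(paper_texts)

indices, scores = matcher.find_matches(query_embedding, candidate_embeddings, top_k=2)
for i, score in zip(indices, scores):
    print(f'{score:.3f}  {paper_texts[i]}')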
@@ -0,0 +1 @@
from .data_preprocessor import DataPreprocessor
@@ -0,0 +1,34 @@
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

class DataPreprocessor:
    def __init__(self):
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def clean_text(self, text):
        # Tokenize and clean text
        tokens = word_tokenize(text.lower())
        tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]
        return ' '.join(tokens)

    def process_papers(self, papers_df):
        # Process research papers
        papers_df['processed_text'] = papers_df['abstract'].apply(self.clean_text)
        papers_df['processed_title'] = papers_df['title'].apply(self.clean_text)
        return papers_df

    def process_profiles(self, profiles_df):
        # Process user profiles
        profiles_df['processed_interests'] = profiles_df['interests'].apply(self.clean_text)
        profiles_df['processed_skills'] = profiles_df['skills'].apply(self.clean_text)
        return profiles_df

    def vectorize_text(self, text_series):
        return self.vectorizer.fit_transform(text_series)
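A sanity check of clean_text on an invented abstract; the nltk.download calls in __init__ fetch punkt, stopwords, and wordnet on first run:

preprocessor = DataPreprocessor()
print(preprocessor.clean_text('This paper reviews deep learning methods in healthcare applications.'))
# -> 'paper reviews deep learning methods healthcare applications'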