Implement AI-Powered Automated Paper Matching #357

Open
wants to merge 1 commit into base: main
5 changes: 5 additions & 0 deletions paper_matching/api/__init__.py
@@ -0,0 +1,5 @@
from flask import Flask

app = Flask(__name__)

# Imported after the app is created so the route decorators in routes.py
# register against this single app instance
from . import routes
53 changes: 53 additions & 0 deletions paper_matching/api/routes.py
@@ -0,0 +1,53 @@
from flask import request, jsonify
from models.profile_analyzer import ProfileAnalyzer
from models.semantic_matcher import SemanticMatcher
from models.recommender import PersonalizedRecommender

# Reuse the single app instance created in api/__init__.py so these routes
# are registered on the app that main.py actually runs
from . import app

profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()

@app.route('/analyze_profile', methods=['POST'])
def analyze_profile():
    data = request.json
    profile_text = data.get('profile_text')

    if not profile_text:
        return jsonify({'error': 'Profile text is required'}), 400

    analysis = profile_analyzer.analyze_profile(profile_text)
    return jsonify(analysis)

@app.route('/match_papers', methods=['POST'])
def match_papers():
    data = request.json
    user_profile = data.get('user_profile')
    papers = data.get('papers')

    if not user_profile or not papers:
        return jsonify({'error': 'Both user profile and papers are required'}), 400

    # Get embeddings
    profile_embedding = semantic_matcher.compute_embeddings([user_profile])[0]
    paper_embeddings = semantic_matcher.compute_embeddings(papers)

    # Find matches
    matches, scores = semantic_matcher.find_matches(profile_embedding, paper_embeddings)

    return jsonify({
        'matches': [papers[i] for i in matches],
        'scores': scores.tolist()
    })

@app.route('/get_recommendations', methods=['POST'])
def get_recommendations():
    data = request.json
    # There is no persistent user store in this PR, so the caller supplies the
    # profile (with an 'embeddings' field) and the candidate papers directly
    user_profile = data.get('user_profile')
    papers = data.get('papers')

    if not user_profile or not papers:
        return jsonify({'error': 'User profile and papers are required'}), 400

    recommendations = recommender.predict(user_profile, papers)
    return jsonify({
        'recommendations': [
            {'paper': paper, 'score': float(score)} for paper, score in recommendations
        ]
    })
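A minimal sketch of how the endpoints above might be exercised once the service is running; the local address and the example payloads are assumptions for illustration, not part of this PR.

import requests

BASE = 'http://localhost:5000'  # assumed local development address

# Analyze a free-text profile
resp = requests.post(f'{BASE}/analyze_profile',
                     json={'profile_text': 'PhD student working on deep learning for healthcare'})
print(resp.json())

# Match a profile against candidate paper abstracts (both are plain strings,
# since the route embeds them directly with the SentenceTransformer model)
resp = requests.post(f'{BASE}/match_papers', json={
    'user_profile': 'machine learning for clinical decision support',
    'papers': [
        'Deep learning methods in healthcare applications',
        'Climate change effects on global ecosystems',
    ],
})
print(resp.json())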
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_papers.csv
@@ -0,0 +1,4 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study,processed_text,processed_title,embeddings
1,"Deep Learning Approaches in Healthcare","This paper reviews...",..."deep learning healthcare ai"..."deep learning approaches healthcare",[0.123, 0.456, ...]
2,"Climate Change Impact Analysis","A comprehensive study...",..."climate change environment analysis"..."climate change impact analysis",[0.234, 0.567, ...]
...
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_profiles.csv
@@ -0,0 +1,4 @@
user_id,name,email,interests,skills,academic_background,research_experience,processed_interests,processed_skills,embeddings
1,"Alex Thompson",...,"ai machine learning healthcare","python tensorflow data analysis",...,[0.111, 0.222, ...]
2,"Maria Garcia",...,"climate science environmental studies","r gis statistical analysis",...,[0.333, 0.444, ...]
...
18 changes: 18 additions & 0 deletions paper_matching/data/raw/.gitignore
@@ -0,0 +1,18 @@
# Data directories
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep

# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/
env/

# IDE
.vscode/
.idea/
6 changes: 6 additions & 0 deletions paper_matching/data/raw/papers.csv
@@ -0,0 +1,6 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study
1,"Deep Learning Approaches in Healthcare","This paper reviews deep learning methods in healthcare applications...","John Smith, Jane Doe","deep learning, healthcare, AI","2023-01-15","Computer Science"
2,"Climate Change Impact Analysis","A comprehensive study of climate change effects on global ecosystems...","Alice Johnson, Bob Wilson","climate change, environment, analysis","2023-02-20","Environmental Science"
3,"Quantum Computing Advances","Recent developments in quantum computing and their implications...","David Brown, Sarah Lee","quantum computing, physics, technology","2023-03-10","Physics"
4,"Machine Learning in Finance","Applications of machine learning algorithms in financial markets...","Mike Chen, Lisa Wang","machine learning, finance, algorithms","2023-04-05","Finance"
5,"Renewable Energy Systems","Analysis of modern renewable energy technologies...","Emma Davis, Tom Miller","renewable energy, sustainability","2023-05-12","Engineering"
6 changes: 6 additions & 0 deletions paper_matching/data/raw/user_profiles.csv
@@ -0,0 +1,6 @@
user_id,name,email,interests,skills,academic_background,research_experience
1,"Alex Thompson","[email protected]","AI, Machine Learning, Healthcare","Python, TensorFlow, Data Analysis","PhD in Computer Science","5 years in AI research"
2,"Maria Garcia","[email protected]","Climate Science, Environmental Studies","R, GIS, Statistical Analysis","MSc in Environmental Science","3 years in climate research"
3,"James Wilson","[email protected]","Quantum Physics, Computing","Quantum Algorithms, Mathematics, C++","PhD in Physics","7 years in quantum computing"
4,"Sophie Chen","[email protected]","Finance, Machine Learning","Python, Financial Modeling, Deep Learning","MBA, MSc in Data Science","4 years in fintech"
5,"Ryan Peters","[email protected]","Renewable Energy, Sustainability","Engineering Design, Solar Systems","MEng in Energy Systems","6 years in energy sector"
50 changes: 50 additions & 0 deletions paper_matching/main.py
@@ -0,0 +1,50 @@
import pandas as pd
from preprocessing import DataPreprocessor
from models import ProfileAnalyzer, SemanticMatcher, PersonalizedRecommender
from api import app

def load_data():
    # Load your data here (this is just an example)
    papers = pd.read_csv('data/raw/papers.csv')
    profiles = pd.read_csv('data/raw/user_profiles.csv')
    return papers, profiles

def preprocess_data(papers, profiles):
    preprocessor = DataPreprocessor()
    processed_papers = preprocessor.process_papers(papers)
    processed_profiles = preprocessor.process_profiles(profiles)
    return processed_papers, processed_profiles

def initialize_models():
    profile_analyzer = ProfileAnalyzer()
    semantic_matcher = SemanticMatcher()
    recommender = PersonalizedRecommender()
    return profile_analyzer, semantic_matcher, recommender

def main():
    # Load and preprocess data
    papers, profiles = load_data()
    processed_papers, processed_profiles = preprocess_data(papers, profiles)

    # Initialize models
    profile_analyzer, semantic_matcher, recommender = initialize_models()

    # Compute embeddings for papers and profiles (pass plain lists of strings)
    paper_embeddings = semantic_matcher.compute_embeddings(processed_papers['processed_text'].tolist())
    profile_embeddings = semantic_matcher.compute_embeddings(processed_profiles['processed_interests'].tolist())

    # Add embeddings to the dataframes
    processed_papers['embeddings'] = paper_embeddings.tolist()
    processed_profiles['embeddings'] = profile_embeddings.tolist()

    # Save processed data
    processed_papers.to_csv('data/processed/processed_papers.csv', index=False)
    processed_profiles.to_csv('data/processed/processed_profiles.csv', index=False)

    print("Data preprocessing and model initialization complete.")

    # Run the Flask app
    app.run(debug=True)

if __name__ == '__main__':
    main()
3 changes: 3 additions & 0 deletions paper_matching/models/__init__.py
@@ -0,0 +1,3 @@
from .profile_analyzer import ProfileAnalyzer
from .semantic_matcher import SemanticMatcher
from .recommender import PersonalizedRecommender
40 changes: 40 additions & 0 deletions paper_matching/models/profile_analyzer.py
@@ -0,0 +1,40 @@
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

class ProfileAnalyzer:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.keyword_extractor = TfidfVectorizer(max_features=100)

    def extract_keywords(self, text):
        doc = self.nlp(text)
        keywords = []

        # Extract named entities
        for ent in doc.ents:
            keywords.append(ent.text)

        # Extract noun phrases
        for chunk in doc.noun_chunks:
            keywords.append(chunk.text)

        return list(set(keywords))

    def extract_skills(self, text):
        doc = self.nlp(text)
        skills = []

        # Custom skill extraction logic
        for token in doc:
            if token.pos_ in ['NOUN', 'PROPN']:
                skills.append(token.text)

        return list(set(skills))

    def analyze_profile(self, profile_text):
        return {
            'keywords': self.extract_keywords(profile_text),
            'skills': self.extract_skills(profile_text),
            'interests': self.extract_keywords(profile_text)  # Can be refined further
        }
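As a rough usage illustration (not part of the diff): ProfileAnalyzer needs the en_core_web_sm model installed (python -m spacy download en_core_web_sm); the example text is made up.

from models.profile_analyzer import ProfileAnalyzer

analyzer = ProfileAnalyzer()
result = analyzer.analyze_profile(
    'Researcher with experience in Python, TensorFlow and medical image analysis.'
)
# result is a dict with 'keywords', 'skills' and 'interests' lists
print(result['skills'])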
42 changes: 42 additions & 0 deletions paper_matching/models/recommender.py
@@ -0,0 +1,42 @@
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class PersonalizedRecommender:
    def __init__(self):
        self.model = RandomForestClassifier()
        self.feedback_data = []

    def prepare_features(self, user_profile, paper):
        # Combine user and paper features
        return np.concatenate([
            user_profile['embeddings'],
            paper['embeddings']
        ])

    def train(self, training_data):
        X = []
        y = []

        for item in training_data:
            features = self.prepare_features(item['user'], item['paper'])
            X.append(features)
            y.append(item['rating'])

        self.model.fit(X, y)

    def predict(self, user_profile, papers):
        predictions = []

        for paper in papers:
            features = self.prepare_features(user_profile, paper)
            score = self.model.predict_proba([features])[0][1]
            predictions.append((paper, score))

        return sorted(predictions, key=lambda x: x[1], reverse=True)

    def update_feedback(self, user_id, paper_id, rating):
        self.feedback_data.append({
            'user_id': user_id,
            'paper_id': paper_id,
            'rating': rating
        })
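A hedged sketch of the data shapes this recommender expects, following prepare_features and train above; the 384-dimensional vectors mirror the all-MiniLM-L6-v2 embedding size and the values are invented.

import numpy as np
from models.recommender import PersonalizedRecommender

recommender = PersonalizedRecommender()

# Each training item pairs a user and a paper (both carrying an 'embeddings'
# vector) with a binary rating: 1 = relevant, 0 = not relevant
training_data = [
    {'user': {'embeddings': np.random.rand(384)},
     'paper': {'embeddings': np.random.rand(384)},
     'rating': 1},
    {'user': {'embeddings': np.random.rand(384)},
     'paper': {'embeddings': np.random.rand(384)},
     'rating': 0},
]
recommender.train(training_data)

# predict() ranks candidate papers for one user, highest score first
user = {'embeddings': np.random.rand(384)}
papers = [{'embeddings': np.random.rand(384)} for _ in range(3)]
ranked = recommender.predict(user, papers)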
25 changes: 25 additions & 0 deletions paper_matching/models/semantic_matcher.py
@@ -0,0 +1,25 @@
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SemanticMatcher:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def compute_embeddings(self, texts):
        return self.model.encode(texts)

    def compute_similarity(self, embedding1, embedding2):
        return cosine_similarity(
            embedding1.reshape(1, -1),
            embedding2.reshape(1, -1)
        )[0][0]

    def find_matches(self, query_embedding, candidate_embeddings, top_k=5):
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            candidate_embeddings
        )[0]

        top_indices = np.argsort(similarities)[::-1][:top_k]
        return top_indices, similarities[top_indices]
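A small usage sketch for the matcher; the texts are invented, and the sentence-transformers model is downloaded on first use.

from models.semantic_matcher import SemanticMatcher

matcher = SemanticMatcher()

papers = [
    'Deep learning methods in healthcare applications',
    'Climate change effects on global ecosystems',
    'Recent developments in quantum computing',
]
query = matcher.compute_embeddings(['machine learning for medical diagnosis'])[0]
candidates = matcher.compute_embeddings(papers)

indices, scores = matcher.find_matches(query, candidates, top_k=2)
for i, score in zip(indices, scores):
    print(f'{papers[i]}  (cosine similarity: {score:.3f})')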
1 change: 1 addition & 0 deletions paper_matching/preprocessing/__init__.py
@@ -0,0 +1 @@
from .data_preprocessor import DataPreprocessor
34 changes: 34 additions & 0 deletions paper_matching/preprocessing/data_preprocessor.py
@@ -0,0 +1,34 @@
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

class DataPreprocessor:
    def __init__(self):
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def clean_text(self, text):
        # Tokenize and clean text
        tokens = word_tokenize(text.lower())
        tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]
        return ' '.join(tokens)

    def process_papers(self, papers_df):
        # Process research papers
        papers_df['processed_text'] = papers_df['abstract'].apply(self.clean_text)
        papers_df['processed_title'] = papers_df['title'].apply(self.clean_text)
        return papers_df

    def process_profiles(self, profiles_df):
        # Process user profiles
        profiles_df['processed_interests'] = profiles_df['interests'].apply(self.clean_text)
        profiles_df['processed_skills'] = profiles_df['skills'].apply(self.clean_text)
        return profiles_df

    def vectorize_text(self, text_series):
        return self.vectorizer.fit_transform(text_series)
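For illustration only, a run of the preprocessor over the sample CSVs committed above; the column names match papers.csv and user_profiles.csv, and the NLTK downloads happen in __init__.

import pandas as pd
from preprocessing import DataPreprocessor

preprocessor = DataPreprocessor()

papers = pd.read_csv('data/raw/papers.csv')
profiles = pd.read_csv('data/raw/user_profiles.csv')

papers = preprocessor.process_papers(papers)        # adds processed_text / processed_title
profiles = preprocessor.process_profiles(profiles)  # adds processed_interests / processed_skills

print(papers[['title', 'processed_title']].head())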