Implement AI-Powered Automated Paper Matching
sanchitc05 committed Oct 31, 2024
1 parent fc3cfe0 commit a35a10d
Showing 19 changed files with 711 additions and 0 deletions.
5 changes: 5 additions & 0 deletions paper_matching/api/__init__.py
@@ -0,0 +1,5 @@
from flask import Flask

app = Flask(__name__)

from . import routes  # imported last so the routes can register on `app` (avoids a circular import)
53 changes: 53 additions & 0 deletions paper_matching/api/routes.py
@@ -0,0 +1,53 @@
from flask import request, jsonify
from models.profile_analyzer import ProfileAnalyzer
from models.semantic_matcher import SemanticMatcher
from models.recommender import PersonalizedRecommender

# Register the routes on the app created in api/__init__.py instead of
# creating a second Flask instance that main.py would never run.
from . import app

profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()

@app.route('/analyze_profile', methods=['POST'])
def analyze_profile():
data = request.json
profile_text = data.get('profile_text')

if not profile_text:
return jsonify({'error': 'Profile text is required'}), 400

analysis = profile_analyzer.analyze_profile(profile_text)
return jsonify(analysis)

@app.route('/match_papers', methods=['POST'])
def match_papers():
data = request.json
user_profile = data.get('user_profile')
papers = data.get('papers')

if not user_profile or not papers:
return jsonify({'error': 'Both user profile and papers are required'}), 400

# Get embeddings
profile_embedding = semantic_matcher.compute_embeddings([user_profile])[0]
paper_embeddings = semantic_matcher.compute_embeddings(papers)

# Find matches
matches, scores = semantic_matcher.find_matches(profile_embedding, paper_embeddings)

return jsonify({
'matches': [papers[i] for i in matches],
'scores': scores.tolist()
})

@app.route('/get_recommendations', methods=['POST'])
def get_recommendations():
    data = request.json
    user_profile = data.get('user_profile')
    papers = data.get('papers')

    if not user_profile or not papers:
        return jsonify({'error': 'Both user profile and papers are required'}), 400

    # predict() expects the full profile and the candidate papers (each
    # carrying an 'embeddings' field) and returns (paper, score) pairs, best first
    recommendations = recommender.predict(user_profile, papers)
    return jsonify({
        'recommendations': [
            {'paper': paper, 'score': float(score)}
            for paper, score in recommendations
        ]
    })
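For a quick smoke test of these endpoints — a hedged sketch, assuming the Flask app is running locally on its default port 5000 and that the requests package is installed (neither is part of this commit):

import requests

resp = requests.post(
    'http://127.0.0.1:5000/match_papers',
    json={
        'user_profile': 'machine learning for healthcare',
        'papers': [
            'Deep learning methods in healthcare applications',
            'Climate change effects on global ecosystems',
        ],
    },
)
print(resp.json())  # {'matches': [...], 'scores': [...]}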
0 changes: 0 additions & 0 deletions paper_matching/data/processed/.gitkeep
Empty file.
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_papers.csv
@@ -0,0 +1,4 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study,processed_text,processed_title,embeddings
1,"Deep Learning Approaches in Healthcare","This paper reviews...",..."deep learning healthcare ai"..."deep learning approaches healthcare",[0.123, 0.456, ...]
2,"Climate Change Impact Analysis","A comprehensive study...",..."climate change environment analysis"..."climate change impact analysis",[0.234, 0.567, ...]
...
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_profiles.csv
@@ -0,0 +1,4 @@
user_id,name,email,interests,skills,academic_background,research_experience,processed_interests,processed_skills,embeddings
1,"Alex Thompson",...,"ai machine learning healthcare","python tensorflow data analysis",...,[0.111, 0.222, ...]
2,"Maria Garcia",...,"climate science environmental studies","r gis statistical analysis",...,[0.333, 0.444, ...]
...
18 changes: 18 additions & 0 deletions paper_matching/data/raw/.gitignore
@@ -0,0 +1,18 @@
# Data directories
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep

# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/
env/

# IDE
.vscode/
.idea/
0 changes: 0 additions & 0 deletions paper_matching/data/raw/.gitkeep
Empty file.
6 changes: 6 additions & 0 deletions paper_matching/data/raw/papers.csv
@@ -0,0 +1,6 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study
1,"Deep Learning Approaches in Healthcare","This paper reviews deep learning methods in healthcare applications...","John Smith, Jane Doe","deep learning, healthcare, AI","2023-01-15","Computer Science"
2,"Climate Change Impact Analysis","A comprehensive study of climate change effects on global ecosystems...","Alice Johnson, Bob Wilson","climate change, environment, analysis","2023-02-20","Environmental Science"
3,"Quantum Computing Advances","Recent developments in quantum computing and their implications...","David Brown, Sarah Lee","quantum computing, physics, technology","2023-03-10","Physics"
4,"Machine Learning in Finance","Applications of machine learning algorithms in financial markets...","Mike Chen, Lisa Wang","machine learning, finance, algorithms","2023-04-05","Finance"
5,"Renewable Energy Systems","Analysis of modern renewable energy technologies...","Emma Davis, Tom Miller","renewable energy, sustainability","2023-05-12","Engineering"
6 changes: 6 additions & 0 deletions paper_matching/data/raw/user_profiles.csv
@@ -0,0 +1,6 @@
user_id,name,email,interests,skills,academic_background,research_experience
1,"Alex Thompson","[email protected]","AI, Machine Learning, Healthcare","Python, TensorFlow, Data Analysis","PhD in Computer Science","5 years in AI research"
2,"Maria Garcia","[email protected]","Climate Science, Environmental Studies","R, GIS, Statistical Analysis","MSc in Environmental Science","3 years in climate research"
3,"James Wilson","[email protected]","Quantum Physics, Computing","Quantum Algorithms, Mathematics, C++","PhD in Physics","7 years in quantum computing"
4,"Sophie Chen","[email protected]","Finance, Machine Learning","Python, Financial Modeling, Deep Learning","MBA, MSc in Data Science","4 years in fintech"
5,"Ryan Peters","[email protected]","Renewable Energy, Sustainability","Engineering Design, Solar Systems","MEng in Energy Systems","6 years in energy sector"
50 changes: 50 additions & 0 deletions paper_matching/main.py
@@ -0,0 +1,50 @@
import pandas as pd
from preprocessing import DataPreprocessor
from models import ProfileAnalyzer, SemanticMatcher, PersonalizedRecommender
from api import app

def load_data():
# Load your data here (this is just an example)
papers = pd.read_csv('data/raw/papers.csv')
profiles = pd.read_csv('data/raw/user_profiles.csv')
return papers, profiles

def preprocess_data(papers, profiles):
preprocessor = DataPreprocessor()
processed_papers = preprocessor.process_papers(papers)
processed_profiles = preprocessor.process_profiles(profiles)
return processed_papers, processed_profiles

def initialize_models():
profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()
return profile_analyzer, semantic_matcher, recommender

def main():
# Load and preprocess data
papers, profiles = load_data()
processed_papers, processed_profiles = preprocess_data(papers, profiles)

# Initialize models
profile_analyzer, semantic_matcher, recommender = initialize_models()

# Compute embeddings for papers and profiles
    paper_embeddings = semantic_matcher.compute_embeddings(processed_papers['processed_text'].tolist())
    profile_embeddings = semantic_matcher.compute_embeddings(processed_profiles['processed_interests'].tolist())

# Add embeddings to the dataframes
processed_papers['embeddings'] = paper_embeddings.tolist()
processed_profiles['embeddings'] = profile_embeddings.tolist()

# Save processed data
processed_papers.to_csv('data/processed/processed_papers.csv', index=False)
processed_profiles.to_csv('data/processed/processed_profiles.csv', index=False)

print("Data preprocessing and model initialization complete.")

# Run the Flask app
app.run(debug=True)

if __name__ == '__main__':
main()
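One caveat worth noting: to_csv writes the embeddings column as the string representation of each list, so anything that reads the processed CSVs back has to parse that column. A minimal sketch, assuming the files were written by main() above:

import ast
import pandas as pd

papers = pd.read_csv('data/processed/processed_papers.csv')
# Each cell holds text like "[0.123, 0.456, ...]"; convert it back to a list of floats.
papers['embeddings'] = papers['embeddings'].apply(ast.literal_eval)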
3 changes: 3 additions & 0 deletions paper_matching/models/__init__.py
@@ -0,0 +1,3 @@
from .profile_analyzer import ProfileAnalyzer
from .semantic_matcher import SemanticMatcher
from .recommender import PersonalizedRecommender
40 changes: 40 additions & 0 deletions paper_matching/models/profile_analyzer.py
@@ -0,0 +1,40 @@
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

class ProfileAnalyzer:
    def __init__(self):
        # Requires the spaCy model: python -m spacy download en_core_web_sm
        self.nlp = spacy.load('en_core_web_sm')
        self.keyword_extractor = TfidfVectorizer(max_features=100)

def extract_keywords(self, text):
doc = self.nlp(text)
keywords = []

# Extract named entities
for ent in doc.ents:
keywords.append(ent.text)

# Extract noun phrases
for chunk in doc.noun_chunks:
keywords.append(chunk.text)

return list(set(keywords))

def extract_skills(self, text):
doc = self.nlp(text)
skills = []

# Custom skill extraction logic
for token in doc:
if token.pos_ in ['NOUN', 'PROPN']:
skills.append(token.text)

return list(set(skills))

def analyze_profile(self, profile_text):
return {
'keywords': self.extract_keywords(profile_text),
'skills': self.extract_skills(profile_text),
'interests': self.extract_keywords(profile_text) # Can be refined further
}
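A short usage sketch for ProfileAnalyzer — the exact output depends on the spaCy model version, so the results shown are only illustrative:

from models.profile_analyzer import ProfileAnalyzer

analyzer = ProfileAnalyzer()
result = analyzer.analyze_profile(
    'PhD in Computer Science; interested in machine learning for healthcare.'
)
print(result['keywords'])  # entities and noun phrases, e.g. 'machine learning'
print(result['skills'])    # nouns and proper nouns, e.g. 'healthcare'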
42 changes: 42 additions & 0 deletions paper_matching/models/recommender.py
@@ -0,0 +1,42 @@
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class PersonalizedRecommender:
def __init__(self):
self.model = RandomForestClassifier()
self.feedback_data = []

def prepare_features(self, user_profile, paper):
# Combine user and paper features
return np.concatenate([
user_profile['embeddings'],
paper['embeddings']
])

def train(self, training_data):
X = []
y = []

for item in training_data:
features = self.prepare_features(item['user'], item['paper'])
X.append(features)
y.append(item['rating'])

self.model.fit(X, y)

def predict(self, user_profile, papers):
predictions = []

for paper in papers:
features = self.prepare_features(user_profile, paper)
            # Probability of the positive class; assumes binary 0/1 ratings
            score = self.model.predict_proba([features])[0][1]
predictions.append((paper, score))

return sorted(predictions, key=lambda x: x[1], reverse=True)

def update_feedback(self, user_id, paper_id, rating):
self.feedback_data.append({
'user_id': user_id,
'paper_id': paper_id,
'rating': rating
})
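A minimal end-to-end sketch of training and ranking — assuming binary ratings (1 = relevant, 0 = not) and 384-dimensional embeddings, which is what all-MiniLM-L6-v2 produces; the random vectors here are stand-ins for real ones:

import numpy as np
from models.recommender import PersonalizedRecommender

rec = PersonalizedRecommender()
user = {'embeddings': np.random.rand(384)}
paper_a = {'embeddings': np.random.rand(384)}
paper_b = {'embeddings': np.random.rand(384)}

rec.train([
    {'user': user, 'paper': paper_a, 'rating': 1},
    {'user': user, 'paper': paper_b, 'rating': 0},
])
ranked = rec.predict(user, [paper_a, paper_b])  # [(paper, score), ...] best first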
25 changes: 25 additions & 0 deletions paper_matching/models/semantic_matcher.py
@@ -0,0 +1,25 @@
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SemanticMatcher:
def __init__(self):
self.model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_embeddings(self, texts):
return self.model.encode(texts)

def compute_similarity(self, embedding1, embedding2):
return cosine_similarity(
embedding1.reshape(1, -1),
embedding2.reshape(1, -1)
)[0][0]

def find_matches(self, query_embedding, candidate_embeddings, top_k=5):
similarities = cosine_similarity(
query_embedding.reshape(1, -1),
candidate_embeddings
)[0]

top_indices = np.argsort(similarities)[::-1][:top_k]
return top_indices, similarities[top_indices]
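A quick usage sketch — note that the first instantiation downloads the all-MiniLM-L6-v2 weights, and the similarity scores are model-dependent:

from models.semantic_matcher import SemanticMatcher

matcher = SemanticMatcher()
papers = [
    'Deep learning methods in healthcare applications',
    'Climate change effects on global ecosystems',
]
query = matcher.compute_embeddings(['machine learning for medicine'])[0]
candidates = matcher.compute_embeddings(papers)
indices, scores = matcher.find_matches(query, candidates, top_k=2)
for i, score in zip(indices, scores):
    print(round(float(score), 3), papers[i])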
1 change: 1 addition & 0 deletions paper_matching/preprocessing/__init__.py
@@ -0,0 +1 @@
from .data_preprocessor import DataPreprocessor
34 changes: 34 additions & 0 deletions paper_matching/preprocessing/data_preprocessor.py
@@ -0,0 +1,34 @@
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

class DataPreprocessor:
    def __init__(self):
        # Download the NLTK data needed for tokenization and stop-word removal
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(max_features=5000)

def clean_text(self, text):
# Tokenize and clean text
tokens = word_tokenize(text.lower())
tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]
return ' '.join(tokens)

def process_papers(self, papers_df):
# Process research papers
papers_df['processed_text'] = papers_df['abstract'].apply(self.clean_text)
papers_df['processed_title'] = papers_df['title'].apply(self.clean_text)
return papers_df

def process_profiles(self, profiles_df):
# Process user profiles
profiles_df['processed_interests'] = profiles_df['interests'].apply(self.clean_text)
profiles_df['processed_skills'] = profiles_df['skills'].apply(self.clean_text)
return profiles_df

def vectorize_text(self, text_series):
return self.vectorizer.fit_transform(text_series)
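And a sketch of the preprocessing step on the raw data — assuming it runs from the paper_matching/ directory so the relative path matches main.py:

import pandas as pd
from preprocessing import DataPreprocessor

pre = DataPreprocessor()
papers = pd.read_csv('data/raw/papers.csv')
processed = pre.process_papers(papers)
print(processed.loc[0, 'processed_text'])
# e.g. 'paper reviews deep learning methods healthcare applications'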