Implement AI-Powered Automated Paper Matching
sanchitc05 committed Oct 31, 2024
1 parent fc3cfe0 commit a35a10d
Showing 19 changed files with 711 additions and 0 deletions.
5 changes: 5 additions & 0 deletions paper_matching/api/__init__.py
@@ -0,0 +1,5 @@
from flask import Flask

app = Flask(__name__)

from . import routes  # imported last so the routes can register on `app` (avoids a circular import)
53 changes: 53 additions & 0 deletions paper_matching/api/routes.py
@@ -0,0 +1,53 @@
from flask import request, jsonify
from models.profile_analyzer import ProfileAnalyzer
from models.semantic_matcher import SemanticMatcher
from models.recommender import PersonalizedRecommender

# Register the routes on the app created in api/__init__.py instead of
# creating a second Flask instance that main.py would never run.
from . import app

profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()

@app.route('/analyze_profile', methods=['POST'])
def analyze_profile():
data = request.json
profile_text = data.get('profile_text')

if not profile_text:
return jsonify({'error': 'Profile text is required'}), 400

analysis = profile_analyzer.analyze_profile(profile_text)
return jsonify(analysis)

@app.route('/match_papers', methods=['POST'])
def match_papers():
data = request.json
user_profile = data.get('user_profile')
papers = data.get('papers')

if not user_profile or not papers:
return jsonify({'error': 'Both user profile and papers are required'}), 400

# Get embeddings
profile_embedding = semantic_matcher.compute_embeddings([user_profile])[0]
paper_embeddings = semantic_matcher.compute_embeddings(papers)

# Find matches
matches, scores = semantic_matcher.find_matches(profile_embedding, paper_embeddings)

return jsonify({
'matches': [papers[i] for i in matches],
'scores': scores.tolist()
})

@app.route('/get_recommendations', methods=['POST'])
def get_recommendations():
    data = request.json
    user_profile = data.get('user_profile')
    papers = data.get('papers')

    if not user_profile or not papers:
        return jsonify({'error': 'Both user profile and papers are required'}), 400

    # predict() expects the full profile and the candidate papers (each
    # carrying an 'embeddings' field) and returns (paper, score) pairs, best first
    recommendations = recommender.predict(user_profile, papers)
    return jsonify({
        'recommendations': [
            {'paper': paper, 'score': float(score)}
            for paper, score in recommendations
        ]
    })
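For a quick smoke test of these endpoints — a hedged sketch, assuming the Flask app is running locally on its default port 5000 and that the requests package is installed (neither is part of this commit):

import requests

resp = requests.post(
    'http://127.0.0.1:5000/match_papers',
    json={
        'user_profile': 'machine learning for healthcare',
        'papers': [
            'Deep learning methods in healthcare applications',
            'Climate change effects on global ecosystems',
        ],
    },
)
print(resp.json())  # {'matches': [...], 'scores': [...]}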
0 changes: 0 additions & 0 deletions paper_matching/data/processed/.gitkeep
Empty file.
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_papers.csv
@@ -0,0 +1,4 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study,processed_text,processed_title,embeddings
1,"Deep Learning Approaches in Healthcare","This paper reviews...",..."deep learning healthcare ai"..."deep learning approaches healthcare",[0.123, 0.456, ...]
2,"Climate Change Impact Analysis","A comprehensive study...",..."climate change environment analysis"..."climate change impact analysis",[0.234, 0.567, ...]
...
4 changes: 4 additions & 0 deletions paper_matching/data/processed/processed_profiles.csv
@@ -0,0 +1,4 @@
user_id,name,email,interests,skills,academic_background,research_experience,processed_interests,processed_skills,embeddings
1,"Alex Thompson",...,"ai machine learning healthcare","python tensorflow data analysis",...,[0.111, 0.222, ...]
2,"Maria Garcia",...,"climate science environmental studies","r gis statistical analysis",...,[0.333, 0.444, ...]
...
18 changes: 18 additions & 0 deletions paper_matching/data/raw/.gitignore
@@ -0,0 +1,18 @@
# Data directories
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep

# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/
env/

# IDE
.vscode/
.idea/
0 changes: 0 additions & 0 deletions paper_matching/data/raw/.gitkeep
Empty file.
6 changes: 6 additions & 0 deletions paper_matching/data/raw/papers.csv
@@ -0,0 +1,6 @@
paper_id,title,abstract,authors,keywords,publication_date,field_of_study
1,"Deep Learning Approaches in Healthcare","This paper reviews deep learning methods in healthcare applications...","John Smith, Jane Doe","deep learning, healthcare, AI","2023-01-15","Computer Science"
2,"Climate Change Impact Analysis","A comprehensive study of climate change effects on global ecosystems...","Alice Johnson, Bob Wilson","climate change, environment, analysis","2023-02-20","Environmental Science"
3,"Quantum Computing Advances","Recent developments in quantum computing and their implications...","David Brown, Sarah Lee","quantum computing, physics, technology","2023-03-10","Physics"
4,"Machine Learning in Finance","Applications of machine learning algorithms in financial markets...","Mike Chen, Lisa Wang","machine learning, finance, algorithms","2023-04-05","Finance"
5,"Renewable Energy Systems","Analysis of modern renewable energy technologies...","Emma Davis, Tom Miller","renewable energy, sustainability","2023-05-12","Engineering"
6 changes: 6 additions & 0 deletions paper_matching/data/raw/user_profiles.csv
@@ -0,0 +1,6 @@
user_id,name,email,interests,skills,academic_background,research_experience
1,"Alex Thompson","[email protected]","AI, Machine Learning, Healthcare","Python, TensorFlow, Data Analysis","PhD in Computer Science","5 years in AI research"
2,"Maria Garcia","[email protected]","Climate Science, Environmental Studies","R, GIS, Statistical Analysis","MSc in Environmental Science","3 years in climate research"
3,"James Wilson","[email protected]","Quantum Physics, Computing","Quantum Algorithms, Mathematics, C++","PhD in Physics","7 years in quantum computing"
4,"Sophie Chen","[email protected]","Finance, Machine Learning","Python, Financial Modeling, Deep Learning","MBA, MSc in Data Science","4 years in fintech"
5,"Ryan Peters","[email protected]","Renewable Energy, Sustainability","Engineering Design, Solar Systems","MEng in Energy Systems","6 years in energy sector"
50 changes: 50 additions & 0 deletions paper_matching/main.py
@@ -0,0 +1,50 @@
import pandas as pd
from preprocessing import DataPreprocessor
from models import ProfileAnalyzer, SemanticMatcher, PersonalizedRecommender
from api import app

def load_data():
# Load your data here (this is just an example)
papers = pd.read_csv('data/raw/papers.csv')
profiles = pd.read_csv('data/raw/user_profiles.csv')
return papers, profiles

def preprocess_data(papers, profiles):
preprocessor = DataPreprocessor()
processed_papers = preprocessor.process_papers(papers)
processed_profiles = preprocessor.process_profiles(profiles)
return processed_papers, processed_profiles

def initialize_models():
profile_analyzer = ProfileAnalyzer()
semantic_matcher = SemanticMatcher()
recommender = PersonalizedRecommender()
return profile_analyzer, semantic_matcher, recommender

def main():
# Load and preprocess data
papers, profiles = load_data()
processed_papers, processed_profiles = preprocess_data(papers, profiles)

# Initialize models
profile_analyzer, semantic_matcher, recommender = initialize_models()

# Compute embeddings for papers and profiles
    paper_embeddings = semantic_matcher.compute_embeddings(processed_papers['processed_text'].tolist())
    profile_embeddings = semantic_matcher.compute_embeddings(processed_profiles['processed_interests'].tolist())

# Add embeddings to the dataframes
processed_papers['embeddings'] = paper_embeddings.tolist()
processed_profiles['embeddings'] = profile_embeddings.tolist()

# Save processed data
processed_papers.to_csv('data/processed/processed_papers.csv', index=False)
processed_profiles.to_csv('data/processed/processed_profiles.csv', index=False)

print("Data preprocessing and model initialization complete.")

# Run the Flask app
app.run(debug=True)

if __name__ == '__main__':
main()
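One caveat worth noting: to_csv writes the embeddings column as the string representation of each list, so anything that reads the processed CSVs back has to parse that column. A minimal sketch, assuming the files were written by main() above:

import ast
import pandas as pd

papers = pd.read_csv('data/processed/processed_papers.csv')
# Each cell holds text like "[0.123, 0.456, ...]"; convert it back to a list of floats.
papers['embeddings'] = papers['embeddings'].apply(ast.literal_eval)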
3 changes: 3 additions & 0 deletions paper_matching/models/__init__.py
@@ -0,0 +1,3 @@
from .profile_analyzer import ProfileAnalyzer
from .semantic_matcher import SemanticMatcher
from .recommender import PersonalizedRecommender
40 changes: 40 additions & 0 deletions paper_matching/models/profile_analyzer.py
@@ -0,0 +1,40 @@
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

class ProfileAnalyzer:
    def __init__(self):
        # Requires the spaCy model: python -m spacy download en_core_web_sm
        self.nlp = spacy.load('en_core_web_sm')
        self.keyword_extractor = TfidfVectorizer(max_features=100)

def extract_keywords(self, text):
doc = self.nlp(text)
keywords = []

# Extract named entities
for ent in doc.ents:
keywords.append(ent.text)

# Extract noun phrases
for chunk in doc.noun_chunks:
keywords.append(chunk.text)

return list(set(keywords))

def extract_skills(self, text):
doc = self.nlp(text)
skills = []

# Custom skill extraction logic
for token in doc:
if token.pos_ in ['NOUN', 'PROPN']:
skills.append(token.text)

return list(set(skills))

def analyze_profile(self, profile_text):
return {
'keywords': self.extract_keywords(profile_text),
'skills': self.extract_skills(profile_text),
'interests': self.extract_keywords(profile_text) # Can be refined further
}
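A short usage sketch for ProfileAnalyzer — the exact output depends on the spaCy model version, so the results shown are only illustrative:

from models.profile_analyzer import ProfileAnalyzer

analyzer = ProfileAnalyzer()
result = analyzer.analyze_profile(
    'PhD in Computer Science; interested in machine learning for healthcare.'
)
print(result['keywords'])  # entities and noun phrases, e.g. 'machine learning'
print(result['skills'])    # nouns and proper nouns, e.g. 'healthcare'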
42 changes: 42 additions & 0 deletions paper_matching/models/recommender.py
@@ -0,0 +1,42 @@
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class PersonalizedRecommender:
def __init__(self):
self.model = RandomForestClassifier()
self.feedback_data = []

def prepare_features(self, user_profile, paper):
# Combine user and paper features
return np.concatenate([
user_profile['embeddings'],
paper['embeddings']
])

def train(self, training_data):
X = []
y = []

for item in training_data:
features = self.prepare_features(item['user'], item['paper'])
X.append(features)
y.append(item['rating'])

self.model.fit(X, y)

def predict(self, user_profile, papers):
predictions = []

for paper in papers:
features = self.prepare_features(user_profile, paper)
            # Probability of the positive class; assumes binary 0/1 ratings
            score = self.model.predict_proba([features])[0][1]
predictions.append((paper, score))

return sorted(predictions, key=lambda x: x[1], reverse=True)

def update_feedback(self, user_id, paper_id, rating):
self.feedback_data.append({
'user_id': user_id,
'paper_id': paper_id,
'rating': rating
})
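A minimal end-to-end sketch of training and ranking — assuming binary ratings (1 = relevant, 0 = not) and 384-dimensional embeddings, which is what all-MiniLM-L6-v2 produces; the random vectors here are stand-ins for real ones:

import numpy as np
from models.recommender import PersonalizedRecommender

rec = PersonalizedRecommender()
user = {'embeddings': np.random.rand(384)}
paper_a = {'embeddings': np.random.rand(384)}
paper_b = {'embeddings': np.random.rand(384)}

rec.train([
    {'user': user, 'paper': paper_a, 'rating': 1},
    {'user': user, 'paper': paper_b, 'rating': 0},
])
ranked = rec.predict(user, [paper_a, paper_b])  # [(paper, score), ...] best first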
25 changes: 25 additions & 0 deletions paper_matching/models/semantic_matcher.py
@@ -0,0 +1,25 @@
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SemanticMatcher:
def __init__(self):
self.model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_embeddings(self, texts):
return self.model.encode(texts)

def compute_similarity(self, embedding1, embedding2):
return cosine_similarity(
embedding1.reshape(1, -1),
embedding2.reshape(1, -1)
)[0][0]

def find_matches(self, query_embedding, candidate_embeddings, top_k=5):
similarities = cosine_similarity(
query_embedding.reshape(1, -1),
candidate_embeddings
)[0]

top_indices = np.argsort(similarities)[::-1][:top_k]
return top_indices, similarities[top_indices]
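A quick usage sketch — note that the first instantiation downloads the all-MiniLM-L6-v2 weights, and the similarity scores are model-dependent:

from models.semantic_matcher import SemanticMatcher

matcher = SemanticMatcher()
papers = [
    'Deep learning methods in healthcare applications',
    'Climate change effects on global ecosystems',
]
query = matcher.compute_embeddings(['machine learning for medicine'])[0]
candidates = matcher.compute_embeddings(papers)
indices, scores = matcher.find_matches(query, candidates, top_k=2)
for i, score in zip(indices, scores):
    print(round(float(score), 3), papers[i])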
1 change: 1 addition & 0 deletions paper_matching/preprocessing/__init__.py
@@ -0,0 +1 @@
from .data_preprocessor import DataPreprocessor
34 changes: 34 additions & 0 deletions paper_matching/preprocessing/data_preprocessor.py
@@ -0,0 +1,34 @@
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

class DataPreprocessor:
    def __init__(self):
        # Download the NLTK data needed for tokenization and stop-word removal
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(max_features=5000)

def clean_text(self, text):
# Tokenize and clean text
tokens = word_tokenize(text.lower())
tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words]
return ' '.join(tokens)

def process_papers(self, papers_df):
# Process research papers
papers_df['processed_text'] = papers_df['abstract'].apply(self.clean_text)
papers_df['processed_title'] = papers_df['title'].apply(self.clean_text)
return papers_df

def process_profiles(self, profiles_df):
# Process user profiles
profiles_df['processed_interests'] = profiles_df['interests'].apply(self.clean_text)
profiles_df['processed_skills'] = profiles_df['skills'].apply(self.clean_text)
return profiles_df

def vectorize_text(self, text_series):
return self.vectorizer.fit_transform(text_series)
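And a sketch of the preprocessing step on the raw data — assuming it runs from the paper_matching/ directory so the relative path matches main.py:

import pandas as pd
from preprocessing import DataPreprocessor

pre = DataPreprocessor()
papers = pd.read_csv('data/raw/papers.csv')
processed = pre.process_papers(papers)
print(processed.loc[0, 'processed_text'])
# e.g. 'paper reviews deep learning methods healthcare applications'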