diff --git a/paper_matching/api/__init__.py b/paper_matching/api/__init__.py
new file mode 100644
index 0000000..1151fa5
--- /dev/null
+++ b/paper_matching/api/__init__.py
@@ -0,0 +1,5 @@
+from flask import Flask
+
+app = Flask(__name__)
+
+from . import routes
\ No newline at end of file
diff --git a/paper_matching/api/routes.py b/paper_matching/api/routes.py
new file mode 100644
index 0000000..48014cf
--- /dev/null
+++ b/paper_matching/api/routes.py
@@ -0,0 +1,53 @@
+from flask import request, jsonify
+
+from models.profile_analyzer import ProfileAnalyzer
+from models.semantic_matcher import SemanticMatcher
+from models.recommender import PersonalizedRecommender
+
+# Register routes on the application created in api/__init__.py instead of
+# creating a second Flask instance that main.py would never see.
+from . import app
+
+profile_analyzer = ProfileAnalyzer()
+semantic_matcher = SemanticMatcher()
+recommender = PersonalizedRecommender()
+
+@app.route('/analyze_profile', methods=['POST'])
+def analyze_profile():
+    data = request.json
+    profile_text = data.get('profile_text')
+
+    if not profile_text:
+        return jsonify({'error': 'Profile text is required'}), 400
+
+    analysis = profile_analyzer.analyze_profile(profile_text)
+    return jsonify(analysis)
+
+@app.route('/match_papers', methods=['POST'])
+def match_papers():
+    data = request.json
+    user_profile = data.get('user_profile')
+    papers = data.get('papers')
+
+    if not user_profile or not papers:
+        return jsonify({'error': 'Both user profile and papers are required'}), 400
+
+    # Get embeddings
+    profile_embedding = semantic_matcher.compute_embeddings([user_profile])[0]
+    paper_embeddings = semantic_matcher.compute_embeddings(papers)
+
+    # Find matches
+    matches, scores = semantic_matcher.find_matches(profile_embedding, paper_embeddings)
+
+    return jsonify({
+        'matches': [papers[i] for i in matches],
+        'scores': scores.tolist()
+    })
+
+@app.route('/get_recommendations', methods=['POST'])
+def get_recommendations():
+    data = request.json
+    user_profile = data.get('user_profile')
+    papers = data.get('papers')
+
+    if not user_profile or not papers:
+        return jsonify({'error': 'Both user profile and papers are required'}), 400
+
+    # Requires a trained recommender (see PersonalizedRecommender.train); the
+    # profile and each paper must carry an 'embeddings' field.
+    recommendations = recommender.predict(user_profile, papers)
+    return jsonify({
+        'recommendations': [
+            {'paper': paper, 'score': float(score)} for paper, score in recommendations
+        ]
+    })
\ No newline at end of file
diff --git a/paper_matching/data/processed/.gitkeep b/paper_matching/data/processed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/paper_matching/data/processed/processed_papers.csv b/paper_matching/data/processed/processed_papers.csv
new file mode 100644
index 0000000..6a8220b
--- /dev/null
+++ b/paper_matching/data/processed/processed_papers.csv
@@ -0,0 +1,4 @@
+paper_id,title,abstract,authors,keywords,publication_date,field_of_study,processed_text,processed_title,embeddings
+1,"Deep Learning Approaches in Healthcare","This paper reviews...",..."deep learning healthcare ai"..."deep learning approaches healthcare",[0.123, 0.456, ...]
+2,"Climate Change Impact Analysis","A comprehensive study...",..."climate change environment analysis"..."climate change impact analysis",[0.234, 0.567, ...]
+...
\ No newline at end of file
diff --git a/paper_matching/data/processed/processed_profiles.csv b/paper_matching/data/processed/processed_profiles.csv
new file mode 100644
index 0000000..9f2ee87
--- /dev/null
+++ b/paper_matching/data/processed/processed_profiles.csv
@@ -0,0 +1,4 @@
+user_id,name,email,interests,skills,academic_background,research_experience,processed_interests,processed_skills,embeddings
+1,"Alex Thompson",...,"ai machine learning healthcare","python tensorflow data analysis",...,[0.111, 0.222, ...]
+2,"Maria Garcia",...,"climate science environmental studies","r gis statistical analysis",...,[0.333, 0.444, ...]
+...
\ No newline at end of file diff --git a/paper_matching/data/raw/.gitignore b/paper_matching/data/raw/.gitignore new file mode 100644 index 0000000..2a85e88 --- /dev/null +++ b/paper_matching/data/raw/.gitignore @@ -0,0 +1,18 @@ +# Data directories +data/raw/* +data/processed/* +!data/raw/.gitkeep +!data/processed/.gitkeep + +# Python +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environment +venv/ +env/ + +# IDE +.vscode/ +.idea/ \ No newline at end of file diff --git a/paper_matching/data/raw/.gitkeep b/paper_matching/data/raw/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/paper_matching/data/raw/papers.csv b/paper_matching/data/raw/papers.csv new file mode 100644 index 0000000..211cc77 --- /dev/null +++ b/paper_matching/data/raw/papers.csv @@ -0,0 +1,6 @@ +paper_id,title,abstract,authors,keywords,publication_date,field_of_study +1,"Deep Learning Approaches in Healthcare","This paper reviews deep learning methods in healthcare applications...","John Smith, Jane Doe","deep learning, healthcare, AI","2023-01-15","Computer Science" +2,"Climate Change Impact Analysis","A comprehensive study of climate change effects on global ecosystems...","Alice Johnson, Bob Wilson","climate change, environment, analysis","2023-02-20","Environmental Science" +3,"Quantum Computing Advances","Recent developments in quantum computing and their implications...","David Brown, Sarah Lee","quantum computing, physics, technology","2023-03-10","Physics" +4,"Machine Learning in Finance","Applications of machine learning algorithms in financial markets...","Mike Chen, Lisa Wang","machine learning, finance, algorithms","2023-04-05","Finance" +5,"Renewable Energy Systems","Analysis of modern renewable energy technologies...","Emma Davis, Tom Miller","renewable energy, sustainability","2023-05-12","Engineering" \ No newline at end of file diff --git a/paper_matching/data/raw/user_profiles.csv b/paper_matching/data/raw/user_profiles.csv new file mode 100644 index 0000000..54916a8 --- /dev/null +++ b/paper_matching/data/raw/user_profiles.csv @@ -0,0 +1,6 @@ +user_id,name,email,interests,skills,academic_background,research_experience +1,"Alex Thompson","alex.t@email.com","AI, Machine Learning, Healthcare","Python, TensorFlow, Data Analysis","PhD in Computer Science","5 years in AI research" +2,"Maria Garcia","m.garcia@email.com","Climate Science, Environmental Studies","R, GIS, Statistical Analysis","MSc in Environmental Science","3 years in climate research" +3,"James Wilson","j.wilson@email.com","Quantum Physics, Computing","Quantum Algorithms, Mathematics, C++","PhD in Physics","7 years in quantum computing" +4,"Sophie Chen","s.chen@email.com","Finance, Machine Learning","Python, Financial Modeling, Deep Learning","MBA, MSc in Data Science","4 years in fintech" +5,"Ryan Peters","r.peters@email.com","Renewable Energy, Sustainability","Engineering Design, Solar Systems","MEng in Energy Systems","6 years in energy sector" \ No newline at end of file diff --git a/paper_matching/main.py b/paper_matching/main.py new file mode 100644 index 0000000..ace3c0a --- /dev/null +++ b/paper_matching/main.py @@ -0,0 +1,50 @@ +import pandas as pd +from preprocessing import DataPreprocessor +from models import ProfileAnalyzer, SemanticMatcher, PersonalizedRecommender +from api import app + +def load_data(): + # Load your data here (this is just an example) + papers = pd.read_csv('data/raw/papers.csv') + profiles = pd.read_csv('data/raw/user_profiles.csv') + return papers, profiles + +def preprocess_data(papers, 
profiles): + preprocessor = DataPreprocessor() + processed_papers = preprocessor.process_papers(papers) + processed_profiles = preprocessor.process_profiles(profiles) + return processed_papers, processed_profiles + +def initialize_models(): + profile_analyzer = ProfileAnalyzer() + semantic_matcher = SemanticMatcher() + recommender = PersonalizedRecommender() + return profile_analyzer, semantic_matcher, recommender + +def main(): + # Load and preprocess data + papers, profiles = load_data() + processed_papers, processed_profiles = preprocess_data(papers, profiles) + + # Initialize models + profile_analyzer, semantic_matcher, recommender = initialize_models() + + # Compute embeddings for papers and profiles + paper_embeddings = semantic_matcher.compute_embeddings(processed_papers['processed_text']) + profile_embeddings = semantic_matcher.compute_embeddings(processed_profiles['processed_interests']) + + # Add embeddings to the dataframes + processed_papers['embeddings'] = paper_embeddings.tolist() + processed_profiles['embeddings'] = profile_embeddings.tolist() + + # Save processed data + processed_papers.to_csv('data/processed/processed_papers.csv', index=False) + processed_profiles.to_csv('data/processed/processed_profiles.csv', index=False) + + print("Data preprocessing and model initialization complete.") + + # Run the Flask app + app.run(debug=True) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/paper_matching/models/__init__.py b/paper_matching/models/__init__.py new file mode 100644 index 0000000..7a01c43 --- /dev/null +++ b/paper_matching/models/__init__.py @@ -0,0 +1,3 @@ +from .profile_analyzer import ProfileAnalyzer +from .semantic_matcher import SemanticMatcher +from .recommender import PersonalizedRecommender \ No newline at end of file diff --git a/paper_matching/models/profile_analyzer.py b/paper_matching/models/profile_analyzer.py new file mode 100644 index 0000000..1aa32ea --- /dev/null +++ b/paper_matching/models/profile_analyzer.py @@ -0,0 +1,40 @@ +import spacy +from sklearn.feature_extraction.text import TfidfVectorizer +from collections import Counter + +class ProfileAnalyzer: + def __init__(self): + self.nlp = spacy.load('en_core_web_sm') + self.keyword_extractor = TfidfVectorizer(max_features=100) + + def extract_keywords(self, text): + doc = self.nlp(text) + keywords = [] + + # Extract named entities + for ent in doc.ents: + keywords.append(ent.text) + + # Extract noun phrases + for chunk in doc.noun_chunks: + keywords.append(chunk.text) + + return list(set(keywords)) + + def extract_skills(self, text): + doc = self.nlp(text) + skills = [] + + # Custom skill extraction logic + for token in doc: + if token.pos_ in ['NOUN', 'PROPN']: + skills.append(token.text) + + return list(set(skills)) + + def analyze_profile(self, profile_text): + return { + 'keywords': self.extract_keywords(profile_text), + 'skills': self.extract_skills(profile_text), + 'interests': self.extract_keywords(profile_text) # Can be refined further + } \ No newline at end of file diff --git a/paper_matching/models/recommender.py b/paper_matching/models/recommender.py new file mode 100644 index 0000000..d9c5a8f --- /dev/null +++ b/paper_matching/models/recommender.py @@ -0,0 +1,42 @@ +from sklearn.ensemble import RandomForestClassifier +import numpy as np + +class PersonalizedRecommender: + def __init__(self): + self.model = RandomForestClassifier() + self.feedback_data = [] + + def prepare_features(self, user_profile, paper): + # Combine user and paper features + return 
np.concatenate([ + user_profile['embeddings'], + paper['embeddings'] + ]) + + def train(self, training_data): + X = [] + y = [] + + for item in training_data: + features = self.prepare_features(item['user'], item['paper']) + X.append(features) + y.append(item['rating']) + + self.model.fit(X, y) + + def predict(self, user_profile, papers): + predictions = [] + + for paper in papers: + features = self.prepare_features(user_profile, paper) + score = self.model.predict_proba([features])[0][1] + predictions.append((paper, score)) + + return sorted(predictions, key=lambda x: x[1], reverse=True) + + def update_feedback(self, user_id, paper_id, rating): + self.feedback_data.append({ + 'user_id': user_id, + 'paper_id': paper_id, + 'rating': rating + }) \ No newline at end of file diff --git a/paper_matching/models/semantic_matcher.py b/paper_matching/models/semantic_matcher.py new file mode 100644 index 0000000..c1309df --- /dev/null +++ b/paper_matching/models/semantic_matcher.py @@ -0,0 +1,25 @@ +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np + +class SemanticMatcher: + def __init__(self): + self.model = SentenceTransformer('all-MiniLM-L6-v2') + + def compute_embeddings(self, texts): + return self.model.encode(texts) + + def compute_similarity(self, embedding1, embedding2): + return cosine_similarity( + embedding1.reshape(1, -1), + embedding2.reshape(1, -1) + )[0][0] + + def find_matches(self, query_embedding, candidate_embeddings, top_k=5): + similarities = cosine_similarity( + query_embedding.reshape(1, -1), + candidate_embeddings + )[0] + + top_indices = np.argsort(similarities)[::-1][:top_k] + return top_indices, similarities[top_indices] \ No newline at end of file diff --git a/paper_matching/preprocessing/__init__.py b/paper_matching/preprocessing/__init__.py new file mode 100644 index 0000000..421a8ca --- /dev/null +++ b/paper_matching/preprocessing/__init__.py @@ -0,0 +1 @@ +from .data_preprocessor import DataPreprocessor \ No newline at end of file diff --git a/paper_matching/preprocessing/data_preprocessor.py b/paper_matching/preprocessing/data_preprocessor.py new file mode 100644 index 0000000..ad7367d --- /dev/null +++ b/paper_matching/preprocessing/data_preprocessor.py @@ -0,0 +1,34 @@ +import pandas as pd +import nltk +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +from sklearn.feature_extraction.text import TfidfVectorizer + +class DataPreprocessor: + def __init__(self): + nltk.download('punkt') + nltk.download('stopwords') + nltk.download('wordnet') + self.stop_words = set(stopwords.words('english')) + self.vectorizer = TfidfVectorizer(max_features=5000) + + def clean_text(self, text): + # Tokenize and clean text + tokens = word_tokenize(text.lower()) + tokens = [token for token in tokens if token.isalpha() and token not in self.stop_words] + return ' '.join(tokens) + + def process_papers(self, papers_df): + # Process research papers + papers_df['processed_text'] = papers_df['abstract'].apply(self.clean_text) + papers_df['processed_title'] = papers_df['title'].apply(self.clean_text) + return papers_df + + def process_profiles(self, profiles_df): + # Process user profiles + profiles_df['processed_interests'] = profiles_df['interests'].apply(self.clean_text) + profiles_df['processed_skills'] = profiles_df['skills'].apply(self.clean_text) + return profiles_df + + def vectorize_text(self, text_series): + return self.vectorizer.fit_transform(text_series) \ No newline at end 
of file
diff --git a/paper_matching/readme.md b/paper_matching/readme.md
new file mode 100644
index 0000000..3f2fffc
--- /dev/null
+++ b/paper_matching/readme.md
@@ -0,0 +1,182 @@
+# AI-Powered Academic Paper Matching System
+
+![GitHub](https://img.shields.io/github/license/Harshdev098/paper_matching)
+![Python](https://img.shields.io/badge/python-3.8%2B-blue)
+![Last Commit](https://img.shields.io/github/last-commit/Harshdev098/paper_matching)
+
+An AI-powered system that matches academic papers with researchers based on their interests, expertise, and research background. It combines natural language processing, machine learning, and semantic analysis to provide relevant paper recommendations.
+
+## 🌟 Key Features
+
+- **Intelligent Profile Analysis**: Automatically analyzes researcher profiles to understand their interests and expertise
+- **Semantic Paper Matching**: Uses sentence-transformer embeddings to match papers with researchers
+- **Personalized Recommendations**: Delivers tailored paper suggestions based on individual research profiles
+- **RESTful API Integration**: Easy-to-use API endpoints for seamless integration
+- **Scalable Architecture**: Designed to handle large volumes of papers and users
+- **Real-time Updates**: Dynamic updating of recommendations as new papers are added
+
+## 🛠️ Technology Stack
+
+- **Backend**: Python 3.8+
+- **API Framework**: Flask
+- **ML/NLP**: scikit-learn, NLTK, spaCy, sentence-transformers
+- **Data Processing**: pandas, numpy
+- **Database**: SQLite (default), PostgreSQL (optional)
+- **Testing**: pytest
+- **Documentation**: Sphinx
+
+## 📁 Project Structure
+
+```
+paper-matching-system/
+│
+├── api/                     # API endpoints and routing
+│   ├── __init__.py
+│   └── routes.py
+│
+├── models/                  # Core matching and recommendation models
+│   ├── __init__.py
+│   ├── profile_analyzer.py
+│   ├── semantic_matcher.py
+│   └── recommender.py
+│
+├── preprocessing/           # Data preprocessing utilities
+│   ├── __init__.py
+│   └── data_preprocessor.py
+│
+├── utils/                   # Helper functions and utilities
+│   ├── __init__.py
+│   └── helpers.py
+│
+├── data/                    # Data storage
+│   ├── raw/                 # Original data files
+│   └── processed/           # Processed data files
+│
+├── tests/                   # Test suite
+│   ├── __init__.py
+│   ├── test_preprocessor.py
+│   ├── test_matcher.py
+│   └── test_api.py
+│
+├── docs/                    # Documentation
+├── main.py                  # Application entry point
+├── data_generator.py        # Sample data generator
+├── requirements.txt         # Project dependencies
+├── config.py                # Configuration settings
+└── README.md
+```
+
+## 🚀 Getting Started
+
+### Prerequisites
+
+- Python 3.8 or higher
+- pip package manager
+- Virtual environment (recommended)
+
+### Installation
+
+1. Clone the repository:
+```bash
+git clone https://github.com/Harshdev098/paper_matching.git
+cd paper_matching
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+3. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+4. Generate sample data:
+```bash
+python data_generator.py
+```
+
+5. Start the application:
+```bash
+python main.py
+```
+
+## 💻 API Usage
+
+All endpoints accept JSON over POST and are served on `http://localhost:5000` by default.
+
+### Authentication
+```python
+import requests
+
+API_KEY = "your_api_key"
+headers = {
+    "Authorization": f"Bearer {API_KEY}",
+    "Content-Type": "application/json"
+}
+```
+
+### Analyze a Researcher Profile
+```python
+response = requests.post(
+    "http://localhost:5000/analyze_profile",
+    json={"profile_text": "PhD student working on deep learning for medical imaging"},
+    headers=headers
+)
+print(response.json())  # extracted keywords, skills and interests
+```
+
+### Match Papers to a Profile
+```python
+payload = {
+    "user_profile": "machine learning applications in healthcare",
+    "papers": [
+        "This paper reviews deep learning methods in healthcare applications...",
+        "A comprehensive study of climate change effects on global ecosystems..."
+    ]
+}
+
+response = requests.post(
+    "http://localhost:5000/match_papers",
+    json=payload,
+    headers=headers
+)
+print(response.json())  # matched paper texts with similarity scores
+```
+
+The `/get_recommendations` endpoint additionally requires a trained `PersonalizedRecommender` and profile/paper objects that carry precomputed embeddings.
+
+## 📊 Data Formats
+
+### User Profile Schema
+```json
+{
+    "user_id": "string",
+    "name": "string",
+    "email": "string",
+    "interests": ["string"],
+    "skills": ["string"],
+    "academic_background": "string",
+    "research_experience": "string"
+}
+```
+
+### Paper Schema
+```json
+{
+    "paper_id": "string",
+    "title": "string",
+    "abstract": "string",
+    "authors": ["string"],
+    "keywords": ["string"],
+    "publication_date": "string",
+    "field_of_study": "string"
+}
+```
+
+## 🔧 Configuration
+
+Edit `config.py` to customize:
+
+- API settings
+- Database configuration
+- Matching algorithm parameters
+- Recommendation thresholds
+- Logging settings
+
+## 🧪 Testing
+
+Run the test suite:
+```bash
+pytest tests/
+```
+
+Generate a coverage report:
+```bash
+pytest --cov=. tests/
+```
+
+## 🤝 Contributing
+
+1. Fork the repository
+2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+### Contribution Guidelines
+
+- Follow the PEP 8 style guide
+- Add unit tests for new features
+- Update documentation
+- Maintain test coverage above 80%
+
+## 🔄 Version History
+
+- **0.2.0**
+  - Enhanced matching algorithm
+  - Added API authentication
+  - Performance improvements
+- **0.1.0**
+  - Initial release
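Editor's note: the README documents the pieces individually but not how they fit together, so here is a minimal end-to-end sketch of the offline matching pipeline. It uses only the modules added in this change and assumes the dependencies are installed, that it is run from the `paper_matching/` directory, and that the sample CSVs under `data/raw/` exist (e.g. via `python data_generator.py` above).

```python
# Minimal sketch: load sample data, preprocess it, embed it, and print
# the papers that best match the first sample user's interests.
import pandas as pd

from preprocessing import DataPreprocessor
from models import SemanticMatcher

papers = pd.read_csv('data/raw/papers.csv')
profiles = pd.read_csv('data/raw/user_profiles.csv')

preprocessor = DataPreprocessor()
papers = preprocessor.process_papers(papers)
profiles = preprocessor.process_profiles(profiles)

matcher = SemanticMatcher()
paper_embeddings = matcher.compute_embeddings(papers['processed_text'].tolist())
profile_embeddings = matcher.compute_embeddings(profiles['processed_interests'].tolist())

# Top 3 papers for the first sample user.
top_indices, scores = matcher.find_matches(profile_embeddings[0], paper_embeddings, top_k=3)
for idx, score in zip(top_indices, scores):
    print(f"{papers.iloc[idx]['title']}: {score:.3f}")
```

This mirrors what `main.py` does before starting the Flask app, minus writing the processed CSVs to `data/processed/`.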
\ No newline at end of file diff --git a/paper_matching/utils/__init__.py b/paper_matching/utils/__init__.py new file mode 100644 index 0000000..0fa5cf9 --- /dev/null +++ b/paper_matching/utils/__init__.py @@ -0,0 +1 @@ +# I am leaving this empty for now, you can add any utility functions if needed \ No newline at end of file diff --git a/paper_matching/utils/helpers.py b/paper_matching/utils/helpers.py new file mode 100644 index 0000000..ed91086 --- /dev/null +++ b/paper_matching/utils/helpers.py @@ -0,0 +1,237 @@ +import re +import json +import numpy as np +from datetime import datetime +from typing import List, Dict, Union, Any +import logging +import pandas as pd +from sklearn.metrics.pairwise import cosine_similarity +from pathlib import Path + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +class DataValidator: + """Validates data formats and contents""" + + @staticmethod + def validate_paper(paper: Dict[str, Any]) -> bool: + """ + Validates paper data structure + """ + required_fields = ['paper_id', 'title', 'abstract', 'authors', 'keywords'] + return all(field in paper for field in required_fields) + + @staticmethod + def validate_profile(profile: Dict[str, Any]) -> bool: + """ + Validates user profile data structure + """ + required_fields = ['user_id', 'interests', 'skills'] + return all(field in profile for field in required_fields) + +class TextProcessor: + """Text processing utilities""" + + @staticmethod + def clean_text(text: str) -> str: + """ + Clean and normalize text + """ + if not isinstance(text, str): + return "" + + # Convert to lowercase + text = text.lower() + + # Remove special characters + text = re.sub(r'[^\w\s]', '', text) + + # Remove extra whitespace + text = re.sub(r'\s+', ' ', text).strip() + + return text + + @staticmethod + def extract_keywords(text: str) -> List[str]: + """ + Extract keywords from text + """ + # Simple keyword extraction (can be enhanced with NLP techniques) + words = text.split() + # Remove common words (you might want to use a proper stop words list) + common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'} + keywords = [word for word in words if word not in common_words] + return keywords + +class SimilarityCalculator: + """Calculate similarities between vectors""" + + @staticmethod + def cosine_similarity_score(vec1: np.ndarray, vec2: np.ndarray) -> float: + """ + Calculate cosine similarity between two vectors + """ + if vec1.size == 0 or vec2.size == 0: + return 0.0 + return float(cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]) + + @staticmethod + def calculate_overlap_score(set1: set, set2: set) -> float: + """ + Calculate Jaccard similarity between two sets + """ + if not set1 or not set2: + return 0.0 + return len(set1.intersection(set2)) / len(set1.union(set2)) + +class FileHandler: + """Handle file operations""" + + @staticmethod + def save_to_json(data: Union[List, Dict], filepath: str) -> None: + """ + Save data to JSON file + """ + try: + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=4) + logger.info(f"Successfully saved data to {filepath}") + except Exception as e: + logger.error(f"Error saving to JSON: {str(e)}") + raise + + @staticmethod + def load_from_json(filepath: str) -> Union[List, Dict]: + """ + Load data from JSON file + """ + try: + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + 
logger.info(f"Successfully loaded data from {filepath}") + return data + except Exception as e: + logger.error(f"Error loading JSON: {str(e)}") + raise + +class ResultsManager: + """Manage and format matching results""" + + @staticmethod + def format_match_results(matches: List[Dict], scores: List[float]) -> List[Dict]: + """ + Format matching results with scores + """ + return [ + { + 'paper': match, + 'score': float(score), + 'timestamp': datetime.now().isoformat() + } + for match, score in zip(matches, scores) + ] + + @staticmethod + def filter_results(results: List[Dict], threshold: float = 0.5) -> List[Dict]: + """ + Filter results based on similarity threshold + """ + return [result for result in results if result['score'] >= threshold] + +class PerformanceMetrics: + """Calculate and track system performance metrics""" + + @staticmethod + def calculate_precision_at_k(relevant_items: set, recommended_items: List, k: int) -> float: + """ + Calculate precision@k for recommendations + """ + if not recommended_items or k <= 0: + return 0.0 + + recommended_k = set(recommended_items[:k]) + relevant_and_recommended = relevant_items.intersection(recommended_k) + + return len(relevant_and_recommended) / k + + @staticmethod + def calculate_recall_at_k(relevant_items: set, recommended_items: List, k: int) -> float: + """ + Calculate recall@k for recommendations + """ + if not relevant_items or not recommended_items or k <= 0: + return 0.0 + + recommended_k = set(recommended_items[:k]) + relevant_and_recommended = relevant_items.intersection(recommended_k) + + return len(relevant_and_recommended) / len(relevant_items) + +class DataExporter: + """Export data in various formats""" + + @staticmethod + def export_to_csv(data: List[Dict], filepath: str) -> None: + """ + Export results to CSV + """ + try: + df = pd.DataFrame(data) + df.to_csv(filepath, index=False) + logger.info(f"Successfully exported data to {filepath}") + except Exception as e: + logger.error(f"Error exporting to CSV: {str(e)}") + raise + + @staticmethod + def export_to_excel(data: List[Dict], filepath: str) -> None: + """ + Export results to Excel + """ + try: + df = pd.DataFrame(data) + df.to_excel(filepath, index=False) + logger.info(f"Successfully exported data to {filepath}") + except Exception as e: + logger.error(f"Error exporting to Excel: {str(e)}") + raise + +def create_directory_if_not_exists(directory: str) -> None: + """ + Create directory if it doesn't exist + """ + Path(directory).mkdir(parents=True, exist_ok=True) + +def get_file_extension(filename: str) -> str: + """ + Get file extension from filename + """ + return Path(filename).suffix.lower() + +def is_valid_file_type(filename: str, allowed_extensions: set) -> bool: + """ + Check if file type is allowed + """ + return get_file_extension(filename) in allowed_extensions + +# Example usage: +if __name__ == "__main__": + # Test text processing + text_processor = TextProcessor() + cleaned_text = text_processor.clean_text("This is a TEST string!!!") + print(f"Cleaned text: {cleaned_text}") + + # Test similarity calculation + calc = SimilarityCalculator() + vec1 = np.array([1, 2, 3]) + vec2 = np.array([2, 4, 6]) + similarity = calc.cosine_similarity_score(vec1, vec2) + print(f"Similarity score: {similarity}") + + # Test data validation + DataValidator \ No newline at end of file