diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d0902ce
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.DS_Store
+*.sqlite3
+data/
+library/migrations/
+*.hypothesis*
+*__pycache__*
diff --git a/WheresMyField/.DS_Store b/WheresMyField/.DS_Store
new file mode 100644
index 0000000..d087523
Binary files /dev/null and b/WheresMyField/.DS_Store differ
diff --git a/WheresMyField/__init__.py b/WheresMyField/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/WheresMyField/__pycache__/__init__.cpython-35.pyc b/WheresMyField/__pycache__/__init__.cpython-35.pyc
new file mode 100644
index 0000000..08ead89
Binary files /dev/null and b/WheresMyField/__pycache__/__init__.cpython-35.pyc differ
diff --git a/WheresMyField/__pycache__/settings.cpython-35.pyc b/WheresMyField/__pycache__/settings.cpython-35.pyc
new file mode 100644
index 0000000..39bbcba
Binary files /dev/null and b/WheresMyField/__pycache__/settings.cpython-35.pyc differ
diff --git a/WheresMyField/__pycache__/urls.cpython-35.pyc b/WheresMyField/__pycache__/urls.cpython-35.pyc
new file mode 100644
index 0000000..e5c8bef
Binary files /dev/null and b/WheresMyField/__pycache__/urls.cpython-35.pyc differ
diff --git a/WheresMyField/__pycache__/wsgi.cpython-35.pyc b/WheresMyField/__pycache__/wsgi.cpython-35.pyc
new file mode 100644
index 0000000..9951bec
Binary files /dev/null and b/WheresMyField/__pycache__/wsgi.cpython-35.pyc differ
diff --git a/WheresMyField/settings.py b/WheresMyField/settings.py
new file mode 100644
index 0000000..8be8746
--- /dev/null
+++ b/WheresMyField/settings.py
@@ -0,0 +1,130 @@
+"""
+Django settings for WheresMyField project.
+
+Generated by 'django-admin startproject' using Django 1.9.9.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.9/ref/settings/
+"""
+
+import os
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'sqo*d$_e9^l5ah%3p50tr#ui1k9-6&b=3ac7qze56^toty9yr1'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+# Django REST framework: the API is restricted to admin users and paginated.
+REST_FRAMEWORK = {
+    'DEFAULT_PERMISSION_CLASSES': ('rest_framework.permissions.IsAdminUser',),
+    'PAGE_SIZE': 10
+}
+
+# Application definition
+
+INSTALLED_APPS = [
+    'rest_framework',
+    'library.apps.LibraryConfig',
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'bootstrap3',
+]
+
+MIDDLEWARE_CLASSES = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'WheresMyField.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'WheresMyField.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/1.9/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/1.9/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/1.9/howto/static-files/
+
+STATIC_URL = '/static/'
+
diff --git a/WheresMyField/urls.py b/WheresMyField/urls.py
new file mode 100644
index 0000000..d376cd7
--- /dev/null
+++ b/WheresMyField/urls.py
@@ -0,0 +1,20 @@
+from django.contrib import admin
+from django.conf.urls import url, include
+from rest_framework import routers
+from library import views
+
+router = routers.DefaultRouter()
+router.register(r'article', views.ArticleViewSet)
+router.register(r'author', views.AuthorViewSet)
+router.register(r'year', views.YearViewSet)
+router.register(r'label', views.LabelViewSet)
+router.register(r'keyword', views.KeyWordViewSet)
+router.register(r'strategies', views.StrategiesViewSet)
+
+urlpatterns = [
+    url(r'^index', views.viewDashboard),
+    url(r'^admin/', admin.site.urls),
+    # Mount the router; without this the registrations above are never served.
+    url(r'^api/', include(router.urls)),
+    url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework'))
+]
diff --git a/WheresMyField/wsgi.py b/WheresMyField/wsgi.py
new file mode 100644
index 0000000..ca0b14f
--- /dev/null
+++ b/WheresMyField/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for WheresMyField project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "WheresMyField.settings")
+
+application = get_wsgi_application()
diff --git a/db.sqlite3 b/db.sqlite3
new file mode 100644
index 0000000..6f9fa63
Binary files /dev/null and b/db.sqlite3 differ
diff --git a/library/.DS_Store b/library/.DS_Store
new file mode 100644
index 0000000..61c080b
Binary files /dev/null and b/library/.DS_Store differ
diff --git a/library/__init__.py b/library/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/library/__pycache__/__init__.cpython-35.pyc b/library/__pycache__/__init__.cpython-35.pyc
new file mode 100644
index 0000000..3a054b2
Binary files /dev/null and b/library/__pycache__/__init__.cpython-35.pyc differ
diff --git a/library/__pycache__/admin.cpython-35.pyc b/library/__pycache__/admin.cpython-35.pyc
new file mode 100644
index 0000000..b42192c
Binary files /dev/null and b/library/__pycache__/admin.cpython-35.pyc differ
diff --git a/library/__pycache__/apps.cpython-35.pyc b/library/__pycache__/apps.cpython-35.pyc
new file mode 100644
index 0000000..d308203
Binary files /dev/null and b/library/__pycache__/apps.cpython-35.pyc differ
diff --git a/library/__pycache__/models.cpython-35.pyc b/library/__pycache__/models.cpython-35.pyc
new file mode 100644
index 0000000..74918a0
Binary files /dev/null and b/library/__pycache__/models.cpython-35.pyc differ
diff --git a/library/__pycache__/serializers.cpython-35.pyc b/library/__pycache__/serializers.cpython-35.pyc
new file mode 100644
index 0000000..dd828df
Binary files
/dev/null and b/library/__pycache__/serializers.cpython-35.pyc differ
diff --git a/library/__pycache__/views.cpython-35.pyc b/library/__pycache__/views.cpython-35.pyc
new file mode 100644
index 0000000..f79cb57
Binary files /dev/null and b/library/__pycache__/views.cpython-35.pyc differ
diff --git a/library/admin.py b/library/admin.py
new file mode 100644
index 0000000..266cdcc
--- /dev/null
+++ b/library/admin.py
@@ -0,0 +1,26 @@
+from django.contrib import admin
+
+from .models import Article, Author, Year, Label, Strategies, KeyWord
+
+
+class ArticleAdmin(admin.ModelAdmin):
+    model = Article
+    search_fields = ['title', 'key']
+    filter_horizontal = ('author', 'list_strategies', 'labels', 'key_word')
+
+
+class AuthorAdmin(admin.ModelAdmin):
+    model = Author
+    search_fields = ['name']
+
+
+class StrategiesAdmin(admin.ModelAdmin):
+    model = Strategies
+    search_fields = ['strategy_name']
+
+admin.site.register(Article, ArticleAdmin)
+admin.site.register(Author, AuthorAdmin)
+admin.site.register(Year)
+admin.site.register(Label)
+admin.site.register(KeyWord)
+admin.site.register(Strategies, StrategiesAdmin)
diff --git a/library/apps.py b/library/apps.py
new file mode 100644
index 0000000..e01db0a
--- /dev/null
+++ b/library/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class LibraryConfig(AppConfig):
+    name = 'library'
diff --git a/library/models.py b/library/models.py
new file mode 100644
index 0000000..682bf1d
--- /dev/null
+++ b/library/models.py
@@ -0,0 +1,158 @@
+from django.db import models
+from django.core.validators import MaxValueValidator
+
+
+class Author(models.Model):
+    """
+    A model for representing an author
+
+    Attributes:
+    ----------
+    - name: Character Field
+    """
+    name = models.CharField(max_length=200)
+
+    class Meta:
+        ordering = ['name']
+
+    def __str__(self):
+        return self.name
+
+
+class Year(models.Model):
+    """
+    A model for representing the year of publication
+
+    Attributes:
+    ----------
+    - year: Positive Integer Field
+
+    """
+    year = models.PositiveIntegerField(validators=[MaxValueValidator(9999)])
+
+    class Meta:
+        ordering = ['year']
+
+    def __str__(self):
+        return "{}".format(self.year)
+
+
+class Label(models.Model):
+    """
+    A model for representing labels of the article.
+
+    Labels are used mainly by me to categorize the papers
+    I have read. For example, Simple, Noisy, Spatial tournaments.
+
+    Attributes:
+    ----------
+    - label: Char Field
+    """
+    label = models.CharField(max_length=100)
+
+    class Meta:
+        ordering = ['label']
+
+    def __str__(self):
+        return self.label
+
+
+class KeyWord(models.Model):
+    """
+    A model for representing key words of the article.
+
+    Key words of the article.
+
+    Attributes:
+    ----------
+    - key_word: Char Field
+    """
+    key_word = models.CharField(max_length=100)
+
+    class Meta:
+        ordering = ['key_word']
+
+    def __str__(self):
+        return self.key_word
+
+
+class Strategies(models.Model):
+    """
+    A model for representing the strategies used by the
+    author for his/her article research.
+
+    Attributes:
+    ----------
+    - strategy_name: Char Field
+        the name of a strategy
+    - description: Text Field
+        a simple description of the strategy
+    - implemented : Char Field
+        whether the strategy has been implemented in Axelrod python library or not
+        (this could possibly change in the future and become Choice Field)
+    """
+    strategy_name = models.CharField(max_length=300)
+    description = models.TextField(blank=True)
+    implemented = models.CharField(max_length=100, blank=True, null=True)
+
+    class Meta:
+        ordering = ['strategy_name']
+
+    def __str__(self):
+        return self.strategy_name
+
+
+class Article(models.Model):
+    """
+    A model for representing an article
+
+    Attributes:
+    -----------
+
+    - title: Text Field
+        the title of the article
+    - author: Many to Many Field
+        a list of authors of the article
+    - date: Foreign Key Field
+        the year of publication
+    - abstract: Text Field
+        the abstract of the article
+    - key: Char Field
+        a key for citation. Looks similar to the Mendeley key
+    - unique_key: Char Field
+        a unique key. Hash of ('Author', 'Title', 'Year', 'Abstract')
+    - labels: Many to Many Field
+        labels for the article
+    - pages: Char Field
+        the pages the article is within the journal
+    - journal: Text Field
+        the journal the article was published
+    - notes: Text Field
+        personal notes for each article when I read it
+    - list_strategies: Many to Many Field
+        a list of strategies
+    - read: Boolean Field
+        true when I have read file, false otherwise
+    - key_word: Many to Many Field
+        a list of key words for the article
+    - provenance: Char Field
+        origin of the record, 'Manual' by default
+    """
+    title = models.TextField()
+    author = models.ManyToManyField(Author, blank=True)
+    date = models.ForeignKey(Year, on_delete=models.CASCADE)
+    abstract = models.TextField(blank=True)
+    key = models.CharField(max_length=20)
+    unique_key = models.CharField(max_length=32, unique=True)
+    labels = models.ManyToManyField(Label, blank=True)
+    pages = models.CharField(max_length=10, blank=True)
+    journal = models.TextField(blank=True)
+    notes = models.TextField(blank=True)
+    list_strategies = models.ManyToManyField(Strategies, blank=True)
+    read = models.BooleanField(blank=True, default=False)
+    key_word = models.ManyToManyField(KeyWord, blank=True)
+    provenance = models.CharField(max_length=20, default='Manual')
+
+    def __str__(self):
+        return "{} - {}".format(self.key, self.title)
+
diff --git a/library/serializers.py b/library/serializers.py
new file mode 100644
index 0000000..f0be430
--- /dev/null
+++ b/library/serializers.py
@@ -0,0 +1,100 @@
+from library.models import Article, Author, Year, Label, Strategies, KeyWord
+from rest_framework import serializers
+
+
+def _first_or_create(model, **lookup):
+    """Return the first ``model`` row matching ``lookup``, creating it if absent.
+
+    ``get_or_create`` is avoided on purpose: none of the looked-up columns is
+    unique, so rows that already exist more than once would make it raise.
+    """
+    found = model.objects.filter(**lookup).first()
+    if found is None:
+        found = model.objects.create(**lookup)
+    return found
+
+
+class AuthorSerializer(serializers.HyperlinkedModelSerializer):
+    papers_on_this_db = serializers.SerializerMethodField()
+
+    class Meta:
+        model = Author
+        fields = "__all__"
+
+    def get_papers_on_this_db(self, obj):
+        return obj.article_set.count()
+
+
+class YearSerializer(serializers.HyperlinkedModelSerializer):
+    papers_on_specific_year = serializers.SerializerMethodField()
+
+    class Meta:
+        model = Year
+        fields = "__all__"
+
+    def get_papers_on_specific_year(self, obj):
+        return obj.article_set.count()
+
+
+class LabelsSerializer(serializers.HyperlinkedModelSerializer):
+    class Meta:
+        model = Label
+        fields = ["label"]
+
+
+class KeyWordSerializer(serializers.HyperlinkedModelSerializer):
+    class Meta:
+        model = KeyWord
+        fields = ["key_word"]
+
+
+class StrategiesSerializer(serializers.HyperlinkedModelSerializer):
+    class Meta:
+        model = Strategies
+        fields = ["strategy_name"]
+
+
+class ArticleSerializer(serializers.HyperlinkedModelSerializer):
+    author = AuthorSerializer(many=True, )
+    date = YearSerializer()
+    labels = LabelsSerializer(many=True, )
+    key_word = KeyWordSerializer(many=True, )
+    list_strategies = StrategiesSerializer(many=True)
+
+    class Meta:
+        model = Article
+        fields = ('key', 'unique_key', 'title', 'author', 'date', 'abstract',
+                  'pages', 'journal', 'labels', 'read', 'key_word',
+                  'provenance', 'list_strategies')
+
+    def create(self, validated_data):
+        """Create an article and attach its nested related objects.
+
+        Year, authors, labels, strategies and key words are reused when a
+        matching row already exists instead of being duplicated.
+        """
+        date = _first_or_create(Year, year=validated_data['date'].get('year'))
+        article = Article.objects.create(
+            date=date,
+            title=validated_data['title'],
+            key=validated_data['key'],
+            # unique_key is unique on the model, so it has to be stored:
+            # leaving it empty makes the second insert fail
+            unique_key=validated_data['unique_key'],
+            abstract=validated_data.get('abstract', ''),
+            pages=validated_data.get('pages', ''),
+            journal=validated_data.get('journal', ''),
+            read=validated_data.get('read', False),
+            provenance=validated_data.get('provenance', 'Manual'))
+
+        for author in validated_data.get('author', []):
+            article.author.add(_first_or_create(Author, name=author['name']))
+        for label in validated_data.get('labels', []):
+            article.labels.add(_first_or_create(Label, label=label['label']))
+        for strategy in validated_data.get('list_strategies', []):
+            article.list_strategies.add(_first_or_create(
+                Strategies, strategy_name=strategy['strategy_name']))
+        for keyword in validated_data.get('key_word', []):
+            article.key_word.add(_first_or_create(
+                KeyWord, key_word=keyword['key_word']))
+        return article
diff --git a/library/templates/library/base.html b/library/templates/library/base.html
new file mode 100644
index
0000000..cf9cf47
--- /dev/null
+++ b/library/templates/library/base.html
@@ -0,0 +1,44 @@
+{% load bootstrap3 %}
+
+{% bootstrap_css %}
+{% bootstrap_javascript %}
+
+{% bootstrap_messages %}
+
+Popular keywords::
+
+
+
+
+Popular Authors::
+
+
+
+
+Popular Journals::
+
+
\ No newline at end of file
diff --git a/library/templates/library/index.html b/library/templates/library/index.html
new file mode 100644
index 0000000..3d1f5af
--- /dev/null
+++ b/library/templates/library/index.html
@@ -0,0 +1 @@
+{% extends 'library/base.html' %}
diff --git a/library/tests.py b/library/tests.py
new file mode 100644
index 0000000..bb145b6
--- /dev/null
+++ b/library/tests.py
@@ -0,0 +1,168 @@
+from django.test import TestCase, Client
+from hypothesis import given, settings, HealthCheck
+from hypothesis.extra.django.models import models
+from django.contrib.auth.models import User
+from django.urls import reverse
+
+from .models import Author, Year, Label, Strategies, Article, KeyWord
+
+
+class TestingEntities(TestCase):
+    """A class which tests whether the parameters are being passed correctly"""
+    def test_values(self):
+        axel = Author.objects.create(name='Axelrod')
+        year = Year.objects.create(year=1980)
+        label = Label.objects.create(label='A simple label')
+        strategy = Strategies.objects.create(strategy_name='Grumpy')
+        key_word = KeyWord.objects.create(key_word='Game Theory')
+
+        # setting up the article
+        article = Article(date=year, title='A simple title', abstract='Blank', key='Key', pages='1-2',
+                          journal='A Journal', unique_key='0129832',
+                          provenance='Manual', read=True)
+        article.save()
+        article.author.add(axel)
+        article.labels.add(label)
+        article.list_strategies.add(strategy)
+        article.key_word.add(key_word)
+
+        self.assertEqual(axel.name, 'Axelrod')
+        self.assertEqual(year.year, 1980)
+        self.assertEqual(label.label, 'A simple label')
+        self.assertEqual(strategy.strategy_name, 'Grumpy')
+        self.assertEqual(strategy.description, '')
+        self.assertEqual(article.title, 'A simple title')
+        self.assertEqual(article.author.all()[0], axel)
+        self.assertEqual(article.date, year)
+        self.assertEqual(article.date.year, 1980)
+        self.assertEqual(article.abstract, 'Blank')
+        self.assertEqual(article.key, 'Key')
+        self.assertEqual(article.labels.all()[0], label)
+        self.assertEqual(article.pages, '1-2')
+        self.assertEqual(article.journal, 'A Journal')
+        self.assertEqual(article.list_strategies.all()[0], strategy)
+        self.assertEqual(article.unique_key, '0129832')
+        self.assertEqual(article.provenance, 'Manual')
+        self.assertEqual(article.read, True)
+        self.assertEqual(key_word.key_word, 'Game Theory')
+
+
+class TestFieldType(TestCase):
+    """A class that randomly selects an article object from the data base and
+    tests the field types for each parameter
+    """
+
+    @settings(suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.too_slow])
+    @given(models(Article, date=models(Year)), models(Author), models(Label),
+           models(Strategies), models(KeyWord))
+    def test_with_hypothesis(self, article, author, label, strategy, key_word):
+
+        article.author.add(author)
+        article.labels.add(label)
+        article.list_strategies.add(strategy)
+        article.key_word.add(key_word)
+
+        self.assertTrue(author.article_set.filter(author=author).exists())
+        self.assertTrue(label.article_set.filter(labels=label).exists())
+        self.assertTrue(strategy.article_set.filter(list_strategies=strategy).exists())
+        self.assertTrue(key_word.article_set.filter(key_word=key_word).exists())
+
+        self.assertEqual(type(article.title), str)
+        self.assertEqual(type(article.date.year), int)
+        self.assertEqual(type(article.abstract), str)
+        self.assertEqual(type(article.key), str)
+        self.assertEqual(type(article.pages), str)
+        self.assertEqual(type(article.journal), str)
+        self.assertEqual(type(article.read), bool)
+
+        self.assertEqual(type(author.name), str)
+        self.assertEqual(type(label.label), str)
+        self.assertEqual(type(strategy.description), str)
+        self.assertLessEqual(len(strategy.strategy_name), 300)
+
+
+class TestNumberOfAppearance(TestCase):
+    """A class which tests the number of times the individual entities, such as
+    Author, Year, Label etc, are being called in article objects"""
+
+    def test_n_appearance(self):
+
+        a_name = Author.objects.create(name='A')
+        b_name = Author.objects.create(name='B')
+        a_year = Year.objects.create(year=1990)
+        b_year = Year.objects.create(year=1991)
+        label = Label.objects.create(label='Simple Label')
+        a_strategy = Strategies.objects.create(strategy_name='Tit For Tat')
+        b_strategy = Strategies.objects.create(strategy_name='Grumpy')
+
+        # create first article
+        a_article = Article(date=a_year, key='Key A', unique_key='1234567890')
+        a_article.save()
+        a_article.author.add(a_name)
+        a_article.labels.add(label)
+        a_article.list_strategies.add(a_strategy)
+
+        # create second article
+        b_article = Article(date=b_year, key='Key B', unique_key='1234567899')
+        b_article.save()
+        b_article.author.add(b_name)
+        b_article.labels.add(label)
+        b_article.list_strategies.add(a_strategy)
+        b_article.list_strategies.add(b_strategy)
+
+        # count
+        count_a_name = a_name.article_set.count()
+        count_b_name = b_name.article_set.count()
+        count_a_year = a_year.article_set.count()
+        count_b_year = b_year.article_set.count()
+        count_label = label.article_set.count()
+        count_a_strategy = a_strategy.article_set.count()
+        count_b_strategy = b_strategy.article_set.count()
+
+        self.assertEqual(count_a_name, 1)
+        self.assertEqual(count_b_name, 1)
+        self.assertEqual(count_a_year, 1)
+        self.assertEqual(count_b_year, 1)
+        self.assertEqual(count_label, 2)
+        self.assertEqual(count_a_strategy, 2)
+        self.assertEqual(count_b_strategy, 1)
+
+
+class TestViews(TestCase):
+    """A class that tests whether the login was successful"""
+
+    def setUp(self):
+        """Create a dummy user"""
+        user = User.objects.create(username='user')
+        user.set_password('1234')
+        user.save()
+
+    def test_the_view(self):
+        self.client = Client()
+        logged_in = self.client.login(username='user', password='1234')
+
+        # check if the login was successful
+        self.assertTrue(logged_in)
+
+        # check a request
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/article/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/author/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/year/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/author/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/label/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/strategies/')
+        self.assertEqual(response.status_code, 302)
+        response = self.client.get(
+            'http://127.0.0.1:8000/admin/library/something/')
+        self.assertEqual(response.status_code, 404)
diff --git a/library/views.py b/library/views.py
new file mode 100644
index 0000000..e151b50
--- /dev/null
+++ b/library/views.py
@@ -0,0 +1,53 @@
+from django.http import HttpResponse
+from django.template import loader
+from django.shortcuts import get_object_or_404, render, render_to_response
+from rest_framework import viewsets
+
+from .models import Article, Author, Year, Label, Strategies, KeyWord
+from .serializers import (ArticleSerializer, AuthorSerializer, YearSerializer,
+                          LabelsSerializer, StrategiesSerializer,
+                          KeyWordSerializer)
+
+
+def viewDashboard(request):
+    """Render library/index.html with hard-coded keyword, author and journal lists."""
+    keywords = ['Game Theory', 'Prisoners Dilemma', 'Tournament']
+    authors = ['Nikoleta Glynatsi', 'Prince Charles', 'Postman Pat', 'Madonna']
+    journals = ['Management Science', 'Operations Research', 'EJOR']
+    return render(request, 'library/index.html', {'keywords':keywords, 'authors':authors, 'journals':journals})
+
+
+class ArticleViewSet(viewsets.ModelViewSet):
+    """API endpoint for articles; WheresMyField/urls.py registers it on its router."""
+    queryset = Article.objects.all()
+    serializer_class = ArticleSerializer
+
+
+class AuthorViewSet(viewsets.ModelViewSet):
+    """API endpoint for authors."""
+    queryset = Author.objects.all()
+    serializer_class = AuthorSerializer
+
+
+class YearViewSet(viewsets.ModelViewSet):
+    """API endpoint for publication years."""
+    queryset = Year.objects.all()
+    serializer_class = YearSerializer
+
+
+class LabelViewSet(viewsets.ModelViewSet):
+    """API endpoint for labels."""
+    queryset = Label.objects.all()
+    serializer_class = LabelsSerializer
+
+
+class KeyWordViewSet(viewsets.ModelViewSet):
+    """API endpoint for key words."""
+    queryset = KeyWord.objects.all()
+    serializer_class = KeyWordSerializer
+
+
+class StrategiesViewSet(viewsets.ModelViewSet):
+    """API endpoint for strategies."""
+    queryset = Strategies.objects.all()
+    serializer_class = StrategiesSerializer
diff --git a/machine_learning/ml_journals.py b/machine_learning/ml_journals.py
new file mode 100644
index 0000000..8580ecb
--- /dev/null
+++ b/machine_learning/ml_journals.py
@@ -0,0 +1,78 @@
+import collections
+import itertools
+import nlp_tools
+from operator import add
+from library.models import Article
+from sklearn.cluster import KMeans
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+class MachineLearning():
+    """A machine learning class to output some useful information to the
+    user"""
+    def __init__(self):
+        self.name = 'machine-learning'
+        # Django's default manager is `objects`; `Article.obj` does not exist
+        self.data = Article.objects.all()
+        self.co_authors = []
+
+    def journals(self):
+        """
+        Count how often each journal occurs among the stored articles.
+
+        The counter is kept on `journals_freq` and returned.
+        """
+        # not stored as `self.journals`: that would shadow this method on the
+        # instance and break a second call
+        self.journal_names = [article.journal for article in self.data]
+        self.journals_freq = collections.Counter(self.journal_names)
+        return self.journals_freq
+
+    def author_connections(self):
+        """
+        Collect every pair of distinct (lower-cased) author names that share
+        an article. The pairs are kept on `co_authors` and returned.
+        """
+        self.authors = [[name.name.lower() for name in art.author.all()] for
+                        art in self.data]
+
+        # rebuilt on every call so repeated calls do not duplicate the pairs
+        self.co_authors = [pair for names in self.authors
+                           for pair in itertools.combinations(names, 2)
+                           if pair[0] != pair[1]]
+        return self.co_authors
+
+    def link_keywords(self, n_terms=5):
+        """
+        Return the `n_terms` highest-weighted terms of the article texts.
+
+        Title and abstract of each article are joined, vectorised with TF-IDF
+        over 2- and 3-grams and fitted with a single-cluster KMeans; the
+        result is a list holding one list of terms per cluster.
+        """
+
+        self.titles = [article.title for article in self.data]
+        self.abstracts = [article.abstract for article in self.data]
+
+        # joined with a space so the last word of the title is not glued to
+        # the first word of the abstract
+        self.text = ['{} {}'.format(title, abstract) for title, abstract
+                     in zip(self.titles, self.abstracts)]
+
+        tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
+                                           min_df=0.01,
+                                           tokenizer=nlp_tools.tokenize_text,
+                                           ngram_range=(2, 3))
+
+        tfidf_matrix = tfidf_vectorizer.fit_transform(self.text)
+
+        terms = tfidf_vectorizer.get_feature_names()
+
+        n_clusters = 1
+        km = KMeans(n_clusters=n_clusters, max_iter=500)
+        km.fit(tfidf_matrix)
+
+        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
+        return [[terms[ind] for ind in order_centroids[i, :n_terms]]
+                for i in range(n_clusters)]
diff --git a/machine_learning/nlp_tools.py b/machine_learning/nlp_tools.py
new file mode 100644
index 0000000..8429ca2
--- /dev/null
+++ b/machine_learning/nlp_tools.py
@@ -0,0 +1,49 @@
+from spacy.en import English
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
+import string
+
+parser = English()
+
+STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
+SYMBOLS = " ".join(string.punctuation).split(" ")
+SPECIAL_CHAR = ["-", "'", "‘", ":", "-----", "---", '--', "...", "…", "“",
+                "”", "–", "—"]
+# every token tokenize_text() throws away, merged once for a single lookup
+_DROPPED = STOPLIST.union(SYMBOLS, SPECIAL_CHAR, ["", " ", "\n", "\n\n"])
+# entities are spelled by concatenation so entity-decoding tools cannot rewrite them
+_AMP = "&"
+
+
+def clean_text(text):
+    """Normalise raw text: flatten newlines, backslashes and dollar signs to
+    spaces, decode the amp/gt/lt HTML entities and lower-case the result."""
+    # get rid of newlines
+    text = text.strip().replace("\n", " ").replace("\r", " ")
+    text = text.strip().replace("\\", " ").replace("$", " ")
+
+    # replace HTML symbols; entities go first, otherwise the bare-ampersand
+    # rule would mangle them (the gt entity would become "andgt;")
+    text = text.replace(_AMP + "gt;", ">").replace(_AMP + "lt;", "<")
+    text = text.replace(_AMP + "amp;", "and").replace(_AMP, "and")
+
+    # lowercase
+    text = text.lower()
+
+    return text
+
+
+def tokenize_text(raw_text):
+    """Return the lemmas of `raw_text` minus stop words, punctuation, special
+    characters and blank tokens."""
+    raw_text = clean_text(raw_text)
+    # spacy function to get tokens
+    tokens = parser(raw_text)
+
+    # lemmatize; tokens whose lemma is the "-PRON-" placeholder keep their
+    # lower-cased surface form
+    lemmas = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-"
+              else tok.lower_ for tok in tokens]
+
+    # remove stopwords, symbols, special characters and blank tokens
+    return [tok for tok in lemmas if tok not in _DROPPED]
diff --git a/manage.py b/manage.py
new file mode 100755
index 0000000..831a073
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import os
+import sys
+
+if __name__ == "__main__":
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "WheresMyField.settings")
+
+    from django.core.management import execute_from_command_line
+
+    execute_from_command_line(sys.argv)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f2191d1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+appdirs==1.4.3
+beautifulsoup4==4.5.3
+coverage==4.3.4
+Django==1.10.6
+django-bootstrap3==8.2.1
+django-filter==1.0.2
+djangorestframework==3.6.2
+Faker==0.7.10
+hypothesis==3.7.0
+Markdown==2.6.8
+packaging==16.8
+pyparsing==2.2.0
+python-dateutil==2.6.0
+pytz==2016.10
+six==1.10.0
diff --git a/scraping.py b/scraping.py
new file mode 100644
index 0000000..fc42d21
--- /dev/null
+++ b/scraping.py
@@ -0,0 +1,41 @@
+import os
+
+import arcas
+from tqdm import tqdm
+
+keywords = ['sustainable software', 'replicability', 'reproducibility',
+            'dopamine']
+# each API once: a repeated entry only re-sends the same requests and
+# overwrites the same result files
+apis = [arcas.Plos, arcas.Arxiv, arcas.Springer]
+
+# make sure the export directory used below exists
+os.makedirs('data', exist_ok=True)
+
+pbar = tqdm(total=(len(keywords) * len(apis)))
+for p in apis:
+
+    api = p()
+    for key in keywords:
+        # paging state is per keyword; sharing it would skip every keyword
+        # after the first failed page
+        start = 1
+        switch = True
+        while switch is not False:
+            parameters = api.parameters_fix(title=key, records=10, start=start)
+
+            url = api.create_url_search(parameters)
+            request = api.make_request(url)
+            root = api.get_root(request)
+            raw_articles = api.parse(root)
+            try:
+                for art in raw_articles:
+                    article = api.to_dataframe(art)
+                    api.export(article, 'data/results_{}_{}_{}.json'.format(
+                        api.__class__.__name__, key, start))
+            except Exception:
+                # best effort: a failing page ends the paging for this keyword
+                switch = False
+            start += 10
+        pbar.update(1)
+