From 141fe162a7659754c8302c5e7d51d0cafc09b1d2 Mon Sep 17 00:00:00 2001 From: Mark Wolff Date: Wed, 7 Aug 2013 14:44:44 -0400 Subject: [PATCH 1/2] Made README.md readable Added line breaks. --- README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8d77bcf..9984a8b 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,19 @@ About ----- -[Serendip-o-matic](http://serendipomatic.org/) connects your sources to digital materials located in libraries, museums, and archives around the world. By first examining your research interests, and then identifying related content in locations such as the Digital Public Library of America (DPLA), Europeana, and Flickr Commons, our serendipity engine helps you discover photographs, documents, maps and other primary sources. - -Whether you begin with text from an article, a Wikipedia page, or a full Zotero collection, Serendip-o-matic's special algorithm extracts key terms and returns a surprising reflection of your interests. Because the tool is designed mostly for inspiration, search results aren't meant to be exhaustive, but rather suggestive, pointing you to materials you might not have discovered. At the very least, the magical input-output process helps you step back and look at your work from a new perspective. Give it a whirl. Your sources may surprise you. +[Serendip-o-matic](http://serendipomatic.org/) connects your sources to digital materials +located in libraries, museums, and archives around the world. By first examining your +research interests, and then identifying related content in locations such as the Digital +Public Library of America (DPLA), Europeana, and Flickr Commons, our serendipity engine +helps you discover photographs, documents, maps and other primary sources. + +Whether you begin with text from an article, a Wikipedia page, or a full Zotero +collection, Serendip-o-matic's special algorithm extracts key terms and returns a +surprising reflection of your interests. Because the tool is designed mostly for +inspiration, search results aren't meant to be exhaustive, but rather suggestive, +pointing you to materials you might not have discovered. At the very least, the magical +input-output process helps you step back and look at your work from a new perspective. +Give it a whirl. Your sources may surprise you. Installation notes for developers --------------------------------- From 6d48f4360602996b3fbdcbb7d543e06c0047adf4 Mon Sep 17 00:00:00 2001 From: Mark Wolff Date: Thu, 8 Aug 2013 00:54:04 -0400 Subject: [PATCH 2/2] Kluge for French stopwords. I think the problem is with NLTK: it is missing "les" and "a" from its French stopword list. --- smartstash/core/utils.py | 6 +++++- smartstash/nltk_data/corpora/stopwords/french | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/smartstash/core/utils.py b/smartstash/core/utils.py index 77285d0..cf6147c 100644 --- a/smartstash/core/utils.py +++ b/smartstash/core/utils.py @@ -33,7 +33,11 @@ def tokenize(text, lang='en'): # if language is not specified or not in our list, fall back to english - stopwords = nltk.corpus.stopwords.words(stopword_lang.get(lang, 'english')) + stopwords = nltk.corpus.stopwords.words(stopword_lang.get(lang)) + if lang == 'fr': + stopwords.append('les') + stopwords.append('a') + tokens = nltk.word_tokenize(text) words = [w.lower() for w in tokens if w.isalnum() and w.lower() not in stopwords] diff --git a/smartstash/nltk_data/corpora/stopwords/french b/smartstash/nltk_data/corpora/stopwords/french index e7cbf4c..fb34121 100644 --- a/smartstash/nltk_data/corpora/stopwords/french +++ b/smartstash/nltk_data/corpora/stopwords/french @@ -15,6 +15,7 @@ il je la le +les leur lui ma @@ -116,6 +117,7 @@ eu eue eues eus +a ai as avons