Skip to content

Commit

Permalink
Merge pull request #174 from cul-it/hotfix/0.1.1
Browse files Browse the repository at this point in the history
Hotfix/0.1.1
  • Loading branch information
mhl10 authored Apr 26, 2018
2 parents c58c2f8 + dc45bbd commit a6c3eba
Show file tree
Hide file tree
Showing 19 changed files with 183 additions and 66 deletions.
16 changes: 8 additions & 8 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "pypi"

[packages]

arxiv-base = "==0.5.1"
arxiv-base = "==0.6.1"
boto = "==2.48.0"
"boto3" = "==1.6.6"
botocore = "==1.9.6"
Expand All @@ -19,15 +19,15 @@ dataclasses = "==0.4"
docutils = "==0.14"
elasticsearch = "==6.2.0"
elasticsearch-dsl = "==6.1.0"
Flask = "==0.12.2"
"Flask-S3" = "==0.3.3"
flask = "==0.12.2"
"flask-s3" = "==0.3.3"
idna = "==2.6"
ipaddress = "==1.0.19"
itsdangerous = "==0.24"
"Jinja2" = "==2.10"
"jinja2" = "==2.10"
jmespath = "==0.9.3"
jsonschema = "==2.6.0"
MarkupSafe = "==1.0"
markupsafe = "==1.0"
mccabe = "==0.6.1"
mock = "==2.0.0"
mypy = "==0.560"
Expand All @@ -44,11 +44,11 @@ requests = "==2.18.4"
"s3transfer" = "==0.1.13"
snowballstemmer = "==1.2.1"
thrift = "==0.11.0"
thrift_connector = "==0.23"
thrift-connector = "==0.23"
typed-ast = "==1.1.0"
"urllib3" = "==1.22"
Werkzeug = "==0.13"
WTForms = "==2.1"
werkzeug = "==0.13"
wtforms = "==2.1"
bleach = "*"


Expand Down
12 changes: 6 additions & 6 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
amazon-kclpy==1.4.4
arxiv-base==0.5.1
arxiv-base==0.6.1
boto==2.48.0
boto3==1.6.6
botocore==1.9.6
Expand Down
2 changes: 1 addition & 1 deletion requirements/prod.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
amazon-kclpy==1.4.4
arxiv-base==0.5.1
arxiv-base==0.6.1
boto==2.48.0
boto3==1.6.6
botocore==1.9.6
Expand Down
2 changes: 1 addition & 1 deletion search/controllers/advanced/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,6 @@ class AdvancedSearchForm(Form):
('-submitted_date', 'Submission date (newest first)'),
('submitted_date', 'Submission date (oldest first)'),
('', 'Relevance')
], validators=[validators.Optional()], default='-announced_date_first')
], validators=[validators.Optional()], default='')
include_older_versions = BooleanField('Include older versions '
'of papers in results')
20 changes: 15 additions & 5 deletions search/controllers/simple/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,18 @@ def search(request_params: dict) -> Response:
# Fall back to form-based search.
form = SimpleSearchForm(request_params)

# Temporary workaround to support classic help search
if form.query.data and form.searchtype.data == 'help':
return {}, status.HTTP_301_MOVED_PERMANENTLY,\
{'Location': 'https://arxiv.org/help/search?method=and'
f'&format=builtin-short&sort=score&words={form.query.data}'}
if form.query.data:
# Temporary workaround to support classic help search
if form.searchtype.data == 'help':
return {}, status.HTTP_301_MOVED_PERMANENTLY,\
{'Location': 'https://arxiv.org/help/search?method=and'
f'&format=builtin-short&sort=score&words={form.query.data}'}

# Support classic "experimental" search
elif form.searchtype.data == 'full_text':
return {}, status.HTTP_301_MOVED_PERMANENTLY,\
{'Location': 'http://search.arxiv.org:8081/'
f'?in=&query={form.query.data}'}

q: Optional[Query]
if form.validate():
Expand Down Expand Up @@ -120,6 +127,9 @@ def search(request_params: dict) -> Response:
"search again. If this problem persists, please report it to "
"[email protected]."
) from e
except Exception as e:
print(e)
raise
else:
logger.debug('form is invalid: %s', str(form.errors))
if 'order' in form.errors or 'size' in form.errors:
Expand Down
2 changes: 1 addition & 1 deletion search/controllers/simple/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class SimpleSearchForm(Form):
('-submitted_date', 'Submission date (newest first)'),
('submitted_date', 'Submission date (oldest first)'),
('', 'Relevance')
], validators=[validators.Optional()], default='-announced_date_first')
], validators=[validators.Optional()], default='')

def validate_query(form: Form, field: StringField) -> None:
"""Validate the length of the querystring, if searchtype is set."""
Expand Down
7 changes: 5 additions & 2 deletions search/process/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ def test_id(self):
self.assertEqual(doc.id, '1234.56789')

def test_abstract(self):
"""Field ``abstract`` is populated from ``abstract``."""
meta = DocMeta(**{'paper_id': '1234.56789', 'abstract': 'abstract!'})
"""Field ``abstract`` is populated from ``abstract_utf8``."""
meta = DocMeta(**{
'paper_id': '1234.56789',
'abstract_utf8': 'abstract!'
})
doc = transform.to_search_document(meta)
self.assertEqual(doc.abstract, 'abstract!')

Expand Down
4 changes: 3 additions & 1 deletion search/process/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def _constructACMClass(meta: DocMeta) -> Optional[list]:


def _transformAuthor(author: dict) -> dict:
# TODO: we should not be stripping punctuation from the name here.
# This should be handled by the analyzer. This is related to ARXIVNG-543.
author['first_name'] = _strip_punctuation(author['first_name']).strip()
author['full_name'] = re.sub(r'\s+', ' ', f"{author['first_name']} {author['last_name']}")
author['initials'] = [pt[0] for pt in author['first_name'].split() if pt]
Expand Down Expand Up @@ -81,7 +83,7 @@ def _constructDOI(meta: DocMeta) -> List[str]:
TransformType = Union[str, Callable]
_transformations: List[Tuple[str, TransformType, bool]] = [
("id", lambda meta: meta.paper_id if meta.is_current else _constructPaperVersion(meta), True),
("abstract", "abstract", False),
("abstract", "abstract_utf8", False),
("authors", _constructAuthors, True),
("authors_freeform", "authors_utf8", False),
("owners", _constructAuthorOwners, False),
Expand Down
3 changes: 0 additions & 3 deletions search/services/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,6 @@ def handle_es_exceptions() -> Generator:
class SearchSession(object):
"""Encapsulates session with Elasticsearch host."""

# TODO: we need to take on security considerations here. Presumably we will
# use SSL. Presumably we will use HTTP Auth, or something else.

def __init__(self, host: str, index: str, port: int=9200,
scheme: str='http', user: Optional[str]=None,
password: Optional[str]=None, mapping: Optional[str]=None,
Expand Down
85 changes: 69 additions & 16 deletions search/services/index/authors.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
"""Query-builders and helpers for searching by author name."""

from typing import Tuple, Optional, List
import re
from string import punctuation
from elasticsearch_dsl import Search, Q, SF
from .util import wildcardEscape, is_literal_query, Q_
from .util import wildcardEscape, is_literal_query, Q_, escape


# Words removed from author query terms by ``_remove_stopwords`` before the
# cross-author match is built; matching against these entries is
# case-sensitive, so they are expected in lowercase.
STOP = ["and", "or", "the", "of", "a", "for", "an"]


# TODO: remove this when we address the author name bug in
# search.process.transform.
def _strip_punctuation(s: str) -> str:
return ''.join([c for c in s if c not in punctuation])


# TODO: revisit author name indexing in document mappings.
# Ideally stopwords would be removed at index time, but authors are indexed
# as keywords which makes that difficult.
def _remove_stopwords(term: str) -> str:
"""Remove common stopwords that will match on institutions."""
_term = str(term)
for stopword in STOP:
_term =re.sub(f"(^|\s+){stopword}(\s+|$)", " ", _term)
return _term


def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
"""Parse a name string into its (likely) constituent parts."""
# We interpret the comma as separating the surname from the forename.
if "," in au_safe:
surname, forename = au_safe.split(',')
return surname.strip(), forename.strip()
au_parts = au_safe.split(',')
if len(au_parts) >= 2:
surname = au_parts[0]
forename = au_parts[1]
return surname.strip(), forename.strip()

# Otherwise, treat the last word in the name as the surname. This isn't
# a great approach from first principles, but it produces reasonable
Expand All @@ -25,6 +50,7 @@ def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
# pieces, for readability.
def construct_author_query(term: str) -> Q:
"""Generate an author name query in the ElasticSearch DSL."""
term = escape(term)
_author_q = Q()
score_functions: List = []

Expand All @@ -35,24 +61,41 @@ def construct_author_query(term: str) -> Q:
au_name, has_wildcard = wildcardEscape(au_name)
au_safe = au_name.replace('*', '').replace('?', '').replace('"', '')
surname_safe, forename_safe = _parseName(au_safe)

if forename_safe is not None:
# TODO: remove this when the author name bug is fixed in
# search.process.transform. Since we are erroneously removing
# punctuation from author names prior to indexing, it's important
# to do the same here so that results are returned.
forename_safe = _strip_punctuation(forename_safe)

fullname_safe = f'{forename_safe} {surname_safe}'
else:
fullname_safe = surname_safe
_q = (
# Matching on keyword field is effectively an exact match.
Q('match', **{
'authors__full_name__exact': {
'query': fullname_safe, 'boost': 10
}
'query': fullname_safe, 'boost': 30
},
})

# The next best case is that the query is a substring of
# the full name.
| Q('match_phrase', **{
'authors__full_name': {'query': fullname_safe, 'boost': 9}
})
)
if not is_literal_query(term):
# Search across all authors, and prefer documents for which a
# greater number of authors respond. For this part of the search
# we want to avoid artificially high scores when only initials
# match, so we drop solo characters from the query.
term_sans_inits = ' '.join(part for part in
_remove_stopwords(term).split()
if len(part) > 1)
_q |= Q('multi_match', fields=['authors.full_name'],
query=term_sans_inits, boost=8, type="cross_fields")
# We support wildcards (?*) within each author name. Since
# ES will treat the non-wildcard part of the term as a literal,
# we need to apply each word in the name separately.
Expand Down Expand Up @@ -89,7 +132,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__full_name': fullname_safe
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -99,7 +143,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__full_name_initialized': au_safe
}
)
),
score_mode='sum'
)
})
]
Expand All @@ -115,7 +160,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__last_name': surname_safe
}
)
),
score_mode='sum'
)
}),
]
Expand Down Expand Up @@ -151,7 +197,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name__exact": forename_safe
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -161,7 +208,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name__exact": init_forename
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -170,7 +218,8 @@ def construct_author_query(term: str) -> Q:
"match_phrase", **{
"authors__first_name": forename_safe
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -179,7 +228,8 @@ def construct_author_query(term: str) -> Q:
"match_phrase", **{
"authors__first_name": init_forename
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -188,7 +238,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name": forename_safe
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -197,7 +248,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name": init_forename
}
)
),
score_mode='sum'
)
}),
SF({
Expand All @@ -206,11 +258,12 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__initials": init_forename.lower()
}
)
),
score_mode='sum'
)
}),
]
_author_q &= Q("nested", path="authors", query=_q)
_author_q &= Q("nested", path="authors", query=_q, score_mode='sum')

return Q('function_score', query=_author_q,
score_mode="sum", boost=1, boost_mode='multiply',
Expand Down
Loading

0 comments on commit a6c3eba

Please sign in to comment.