test suite: parsing tests + system refactor
The whole parsing system is refactored for greater clarity and easier
testing; thanks to that, individual steps of the process can now be
tested. This change includes un-privating some API methods.
Additionally, during this step of development, the cause of
bug #30 was discovered and the responsible function fixed.
Noiredd committed Feb 1, 2020
1 parent 33dbf75 commit aac15e9
Showing 3 changed files with 124 additions and 44 deletions.
103 changes: 61 additions & 42 deletions filmatyk/filmweb.py
@@ -2,6 +2,7 @@
import json

from bs4 import BeautifulSoup as BS
+import html
import requests_html

import containers
@@ -11,10 +12,13 @@ class Constants(object):
login_path = 'https://ssl.filmweb.pl/j_login'
base_path = 'https://www.filmweb.pl'
auth_error = 'błędny e-mail lub hasło' #TODO: be a bit more intelligent here
+main_class = 'userVotesPage__results'
item_class = 'userVotesPage__result'
-m_cnt_span = 'blockHeader__titleInfoCount'
-s_cnt_span = 'blockHeader__titleInfoCount'
-g_cnt_span = 'blockHeader__titleInfoCount'
+rating_source = 'userVotes'
+rating_stype = 'application/json'
+m_count_span = 'blockHeader__titleInfoCount'
+s_count_span = 'blockHeader__titleInfoCount'
+g_count_span = 'blockHeader__titleInfoCount'
@classmethod
def getUserPage(self, username):
return self.base_path + '/user/' + username
@@ -78,9 +82,9 @@ def __init__(self, callback, username:str=''):
'Game': self.Constants.getUserGamePage
}
self.countSpanClasses = {
-'Movie': self.Constants.m_cnt_span,
-'Series': self.Constants.s_cnt_span,
-'Game': self.Constants.g_cnt_span
+'Movie': self.Constants.m_count_span,
+'Series': self.Constants.s_count_span,
+'Game': self.Constants.g_count_span
}

def __cacheAllParsingRules(self):
@@ -150,7 +154,7 @@ def getNumOf(self, itemtype:str):
except KeyError:
return 0, 0 # should never happen though
url = getURL(self.username)
-page = self.__fetchPage(url)
+page = self.fetchPage(url)
# TODO: in principle, this page could be cached for some small time
#the number of user's movies is inside a span of a specific class
items = 0
@@ -180,12 +184,12 @@ def getItemsPage(self, itemtype:str, page:int=1):
except KeyError:
return [] # should never happen though
url = getURL(self.username, page)
-page = self.__fetchPage(url)
-data = self.__parsePage(page, itemtype)
+page = self.fetchPage(url)
+data = self.parsePage(page, itemtype)
return data

-def __fetchPage(self, url):
-#fetch the page and return its parsed representation
+def fetchPage(self, url):
+"""Fetch the page and return its BeautifulSoup representation."""
try:
page = self.session.get(url)
except:
@@ -197,32 +201,40 @@ def __fetchPage(self, url):
else:
return BS(page.html.html, 'lxml')

-def __parsePage(self, page, itemtype:str):
-parsed = []
-#find all voting divs with the item details (that will be parsed)
-for div in page.body.find_all('div'):
-if not div.has_attr('data-id') or not div.has_attr('class'):
-continue
-if not self.Constants.item_class in div.attrs['class']:
-continue
-#parse each single item (constructs an item object)
-parsed.append(self.__parseOne(div, itemtype))
-#ratings are stored elsewhere, but fortunately they are just JSONs
-for span in page.body.find_all('span'):
-if not span.has_attr('id'):
-continue
-span_id = span.attrs['id']
-for p in span.parents:
-if p.has_attr('data-source') and 'userVotes' in p.attrs['data-source']:
-#get a formatted dict from the JSON and ID of the item it belongs to
-rating, id = self.__parseRating(span.text)
-#among the parsed items, find one with matching ID and attach
-for item in parsed:
-if item.properties['id'] == id:
-item.addRating(rating)
-return parsed
+def parsePage(self, page, itemtype:str):
+"""Parse items and ratings, returning constructed Item objects."""
+data_div = self.extractDataSource(page)
+sub_divs = self.extractItems(data_div)
+parsed_items = [self.parseOne(div, itemtype) for div in sub_divs]
+ratings = [self.parseRating(txt) for txt in self.extractRatings(data_div)]
+for rating, iid in ratings:
+for item in parsed_items:
+if item.getRawProperty('id') == iid:
+item.addRating(rating)
+return parsed_items

+def extractDataSource(self, page):
+"""Extract the div that holds all the data."""
+return page.find('div', attrs={'class': self.Constants.main_class})
+
+def extractItems(self, div):
+"""From the main div, extract all divs holding item details."""
+sub_divs = div.find_all('div', attrs={'class': self.Constants.item_class})
+sub_divs = [div for div in sub_divs if div.has_attr('data-id')]
+return sub_divs

-def __parseOne(self, div, itemtype:str):
+def extractRatings(self, div):
+"""From the main div, extract all item ratings.
+They're held in a specific span as <script> contents.
+"""
+span = div.find('span', attrs={'data-source': self.Constants.rating_source})
+scripts = span.find_all('script', attrs={'type': self.Constants.rating_stype})
+ratings = [script.getText() for script in scripts]
+return ratings
+
+def parseOne(self, div, itemtype:str):
+"""Parse a single item, constructing its container representation."""
#first, gather all results in a dict
parsed = {'id': int(div.attrs['data-id'])}
#then, select the right set of parsing rules
@@ -257,8 +269,13 @@ def __parseOne(self, div, itemtype:str):
constructObject = containers.classByString[itemtype]
return constructObject(**parsed)

-def __parseRating(self, text):
-#FW stores the ratings as simple dict serialized to JSON
+def parseRating(self, text):
+"""Parse the rating information into a compatible dict.
+FW stores the ratings as a simple dict serialized to JSON; this only ensures
+all the entries are present and translates them to the standard expected by
+Item's addRating method.
+"""
origDict = json.loads(text)
#ensure all date keys are present
try:
@@ -270,13 +287,15 @@ def __parseRating(self, text):
date_['m'] = 1
if 'd' not in date_.keys():
date_['d'] = 1
-#translate that dict to more readable standard
-id = origDict['eId']
+# unescape HTML-coded characters from the comment
+comment = html.unescape(origDict['c'] if 'c' in origDict.keys() else '')
+# translate that dict to more readable standard
+iid = origDict['eId']
isFaved = origDict['f'] if 'f' in origDict.keys() else 0
ratingDict = {
'rating': int(origDict['r']),
-'comment': origDict['c'] if 'c' in origDict.keys() else '',
+'comment': comment,
'dateOf': date_,
'faved': isFaved
}
-return ratingDict, id
+return ratingDict, iid
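
Aside (not part of the diff): a minimal sketch of what the new `parseRating` consumes and produces. The instance `api` and all field values are hypothetical; only the JSON keys referenced in the code above (`eId`, `r`, `c`, `f`, `d`) are assumed, and the elided date-handling lines are assumed to default a missing month or day to 1:

```python
# Hypothetical rating JSON as stored by Filmweb (values made up for illustration):
text = '{"eId": 123, "r": 8, "c": "great movie &quot;quote&quot;", "f": 1, "d": {"y": 2020, "m": 1}}'
rating, iid = api.parseRating(text)  # api: an initialized FilmwebAPI
# rating == {'rating': 8, 'comment': 'great movie "quote"',
#            'dateOf': {'y': 2020, 'm': 1, 'd': 1}, 'faved': 1}
# iid == 123
```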
10 changes: 10 additions & 0 deletions test/README.md
@@ -25,3 +25,13 @@ However, it is **required** to execute this test at least once -
otherwise other tests will error out.
To do this:
`cd test && python test_api.py all`

+#### Detailed offline test: `TestAPIParsing`
+
+Test class `TestAPIParsing` performs step-by-step tests of the parsing mechanism.
+The items extracted from the raw HTML data are in fact assembled from two sources:
+details of the rated items (movies etc.) are stored in one place,
+while their ratings are kept elsewhere.
+Tests run sequentially, from locating the data sources,
+through parsing a single entity, to parsing a complete page.
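
Aside (not part of the diff): to run only these offline parsing tests, a standard `unittest` invocation such as `cd test && python -m unittest test_api.TestAPIParsing` should work, provided the assets saved by the online test are already present.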

55 changes: 53 additions & 2 deletions test/test_api.py
@@ -68,7 +68,7 @@ def test_02_fetch_one(self):
self.api.username,
page=1,
)
-page = self.api._FilmwebAPI__fetchPage(url)
+page = self.api.fetchPage(url)
text = page.prettify()
self.assertIsInstance(text, str)
self.assertGreater(len(text), 100 * 2 ** 10)
@@ -81,7 +81,7 @@ def test_03_fetch_save(self):
N_PAGES = 3
for i in range(N_PAGES):
url = self.api.Constants.getUserMoviePage(self.api.username, page=i+1)
-page = self.api._FilmwebAPI__fetchPage(url)
+page = self.api.fetchPage(url)
path = os.path.join('assets', 'movies_{}.html'.format(i+1))
with open(path, 'w', encoding='utf-8') as html:
text = page.prettify()
@@ -91,6 +91,57 @@
self.assertIn('movies_{}.html'.format(i+1), os.listdir('assets'))


+class TestAPIParsing(unittest.TestCase):
+"""Test API parsing functionalities.
+Starts with extraction of the main data region - a div that holds details of
+all items and ratings. Then tests parsing of individual items, and finally of
+a whole page.
+"""
+
+@classmethod
+def setUpClass(self):
+self.api = filmweb.FilmwebAPI(None)
+self.page = None
+with open(os.path.join('assets', 'movies_1.html'), 'r', encoding='utf-8') as html:
+self.page = BS(html.read(), 'lxml')
+
+def test_01_data_source_extract(self):
+"""Find the main div containing details of rated objects."""
+div = self.api.extractDataSource(self.page)
+self.assertIsNotNone(div)
+self.assertGreater(len(div.getText()), 10**4)
+
+def test_02_item_divs_extract(self):
+"""Retrieve all the item detail divs."""
+div = self.api.extractDataSource(self.page)
+items = self.api.extractItems(div)
+self.assertGreater(len(items), 0)
+
+def test_03_item_ratings_extract(self):
+"""Retrieve all the item rating strings."""
+div = self.api.extractDataSource(self.page)
+items = self.api.extractItems(div)
+ratings = self.api.extractRatings(div)
+self.assertEqual(len(items), len(ratings))
+
+def test_04_single_parsing(self):
+"""Parse a single item and rating."""
+div = self.api.extractDataSource(self.page)
+items = self.api.extractItems(div)
+item = self.api.parseOne(items[0], 'Movie')
+self.assertGreater(len(item['title']), 2)
+ratings = self.api.extractRatings(div)
+rating, rid = self.api.parseRating(ratings[0])
+self.assertIn('rating', rating.keys())
+self.assertEqual(rid, item.getRawProperty('id'))
+
+def test_10_parse_page(self):
+"""Parse an entire page of movies."""
+items = self.api.parsePage(self.page, 'Movie')
+self.assertGreater(len(items), 0)


if __name__ == "__main__":
try:
TestAPIBasics.noTests = (sys.argv[1] != 'all')
