-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
schema: Use pd types rather than python types (#298)
- Loading branch information
1 parent
7b5061e
commit 10e5ea9
Showing
22 changed files
with
716 additions
and
193 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,7 @@ class MemberActivityDataset: | |
domain: str = field(key=True) | ||
hasShortcut: bool | ||
country: str | ||
DOMAIN_USED_COUNT: int | ||
domain_used_count: int | ||
|
||
|
||
@meta(owner="[email protected]") | ||
|
@@ -53,7 +53,7 @@ class MemberDataset: | |
@dataset | ||
class MemberActivityDatasetCopy: | ||
domain: str = field(key=True) | ||
DOMAIN_USED_COUNT: int | ||
domain_used_count: int | ||
time: datetime = field(timestamp=True) | ||
url: str | ||
uid: str | ||
|
@@ -71,11 +71,11 @@ def copy(cls, ds: Dataset): | |
@featureset | ||
class DomainFeatures: | ||
domain: str = feature(id=1) | ||
DOMAIN_USED_COUNT: int = feature(id=2) | ||
domain_used_count: int = feature(id=2) | ||
|
||
@extractor(depends_on=[MemberActivityDatasetCopy]) | ||
@inputs(Query.domain) | ||
@outputs(domain, DOMAIN_USED_COUNT) | ||
@outputs(domain, domain_used_count) | ||
def get_domain_feature(cls, ts: pd.Series, domain: pd.Series): | ||
df, found = MemberActivityDatasetCopy.lookup( # type: ignore | ||
ts, domain=domain | ||
|
@@ -106,16 +106,16 @@ def test_invalid_sync(self, client): | |
@featureset | ||
class DomainFeatures2: | ||
domain: str = feature(id=1) | ||
DOMAIN_USED_COUNT: int = feature(id=2) | ||
domain_used_count: int = feature(id=2) | ||
|
||
@extractor() | ||
@inputs(Query.domain) | ||
@outputs(domain, DOMAIN_USED_COUNT) | ||
@outputs(domain, domain_used_count) | ||
def get_domain_feature(cls, ts: pd.Series, domain: pd.Series): | ||
df, found = MemberActivityDatasetCopy.lookup( # type: ignore | ||
ts, domain=domain | ||
) | ||
return df[[str(cls.domain), str(cls.DOMAIN_USED_COUNT)]] | ||
return df[[str(cls.domain), str(cls.domain_used_count)]] | ||
|
||
|
||
class TestInvalidExtractorDependsOn(unittest.TestCase): | ||
|
@@ -133,7 +133,7 @@ class MemberActivityDataset: | |
domain: str = field(key=True) | ||
hasShortcut: bool | ||
country: str | ||
DOMAIN_USED_COUNT: int | ||
domain_used_count: int | ||
|
||
@meta(owner="[email protected]") | ||
@source(webhook.endpoint("MemberDataset")) | ||
|
@@ -150,7 +150,7 @@ class MemberDataset: | |
@dataset | ||
class MemberActivityDatasetCopy: | ||
domain: str = field(key=True) | ||
DOMAIN_USED_COUNT: int | ||
domain_used_count: int | ||
time: datetime = field(timestamp=True) | ||
url: str | ||
uid: str | ||
|
@@ -167,11 +167,11 @@ def copy(cls, ds: Dataset): | |
@featureset | ||
class DomainFeatures: | ||
domain: str = feature(id=1) | ||
DOMAIN_USED_COUNT: int = feature(id=2) | ||
domain_used_count: int = feature(id=2) | ||
|
||
@extractor(depends_on=[MemberActivityDatasetCopy]) | ||
@inputs(Query.domain) | ||
@outputs(domain, DOMAIN_USED_COUNT) | ||
@outputs(domain, domain_used_count) | ||
def get_domain_feature(cls, ts: pd.Series, domain: pd.Series): | ||
df, found = MemberActivityDatasetCopy.lookup( # type: ignore | ||
ts, domain=domain | ||
|
@@ -188,7 +188,7 @@ def get_domain_feature(cls, ts: pd.Series, domain: pd.Series): | |
) | ||
client.extract_features( | ||
output_feature_list=[DomainFeatures2], | ||
input_feature_list=[Query], | ||
input_feature_list=[Query.member_id], | ||
input_dataframe=pd.DataFrame( | ||
{ | ||
"Query.domain": [ | ||
|
@@ -262,9 +262,8 @@ def test_no_access(self, client): | |
) | ||
else: | ||
assert ( | ||
"Extractor `get_domain_feature` in `DomainFeatures2` " | ||
"failed to run with error: name " | ||
"'MemberActivityDatasetCopy' is not defined. " == str(e.value) | ||
str(e.value) | ||
== """Dataset `MemberActivityDataset` is an input to the pipelines: `['copy']` but is not synced. Please add it to the sync call.""" | ||
) | ||
|
||
@mock | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,16 +8,15 @@ | |
from fennel import featureset, extractor, feature | ||
from fennel.datasets import dataset, field | ||
from fennel.lib.metadata import meta | ||
from fennel.lib.schema import inputs, outputs | ||
from fennel.lib.schema import inputs, outputs, between | ||
from fennel.sources import source | ||
from fennel.datasets import pipeline, Dataset | ||
from fennel.lib.aggregate import Sum | ||
from fennel.lib.aggregate import Sum, LastK, Distinct | ||
from fennel.lib.window import Window | ||
from fennel.sources import Webhook | ||
from fennel.test_lib import mock, MockClient | ||
|
||
from typing import List | ||
|
||
from typing import List, Optional | ||
|
||
client = MockClient() | ||
|
||
|
@@ -29,7 +28,7 @@ | |
@dataset | ||
class MovieInfo: | ||
title: str = field(key=True) | ||
actors: List[str] # can be an empty list | ||
actors: List[Optional[str]] # can be an empty list | ||
release: datetime | ||
|
||
|
||
|
@@ -39,15 +38,15 @@ class MovieInfo: | |
class TicketSale: | ||
ticket_id: str | ||
title: str | ||
price: int | ||
price: int # type: ignore | ||
at: datetime | ||
|
||
|
||
@meta(owner="[email protected]") | ||
@dataset | ||
class ActorStats: | ||
name: str = field(key=True) | ||
revenue: int | ||
revenue: int # type: ignore | ||
at: datetime | ||
|
||
@pipeline(version=1, tier="prod") | ||
|
@@ -101,6 +100,46 @@ def foo(df): | |
) | ||
|
||
|
||
@meta(owner="[email protected]") | ||
@dataset | ||
class ActorStatsList: | ||
name: str = field(key=True) | ||
revenue: List[int] # type: ignore | ||
revenue_distinct: List[int] # type: ignore | ||
at: datetime | ||
|
||
@pipeline(version=1, tier="prod") | ||
@inputs(MovieInfo, TicketSale) | ||
def pipeline_join(cls, info: Dataset, sale: Dataset): | ||
uniq = sale.groupby("ticket_id").first() | ||
c = ( | ||
uniq.join(info, how="inner", on=["title"]) | ||
.explode(columns=["actors"]) | ||
.rename(columns={"actors": "name"}) | ||
) | ||
# name -> Option[str] | ||
schema = c.schema() | ||
schema["name"] = str | ||
c = c.transform(lambda x: x, schema) | ||
return c.groupby("name").aggregate( | ||
[ | ||
LastK( | ||
window=Window("forever"), | ||
of="price", | ||
into_field="revenue", | ||
limit=10, | ||
dedup=False, | ||
), | ||
Distinct( | ||
window=Window("forever"), | ||
of="price", | ||
into_field="revenue_distinct", | ||
unordered=True, | ||
), | ||
] | ||
) | ||
|
||
|
||
@meta(owner="[email protected]") | ||
@featureset | ||
class RequestFeatures: | ||
|
@@ -132,7 +171,7 @@ def extract_revenue2(cls, ts: pd.Series, name: pd.Series): | |
class TestMovieTicketSale(unittest.TestCase): | ||
@mock | ||
def test_movie_ticket_sale(self, client): | ||
datasets = [MovieInfo, TicketSale, ActorStats] # type: ignore | ||
datasets = [MovieInfo, TicketSale, ActorStats, ActorStatsList] # type: ignore | ||
featuresets = [ActorFeatures, RequestFeatures] | ||
client.sync(datasets=datasets, featuresets=featuresets, tier="prod") # type: ignore | ||
client.sleep() | ||
|
@@ -166,10 +205,10 @@ def test_movie_ticket_sale(self, client): | |
two_hours_ago = now - timedelta(hours=2) | ||
columns = ["ticket_id", "title", "price", "at"] | ||
data = [ | ||
["1", "Titanic", 50, one_hour_ago], | ||
["2", "Titanic", 100, one_day_ago], | ||
["3", "Jumanji", 25, one_hour_ago], | ||
["4", "The Matrix", 50, two_hours_ago], # no match | ||
["1", "Titanic", "50", one_hour_ago], | ||
["2", "Titanic", "100", one_day_ago], | ||
["3", "Jumanji", "25", one_hour_ago], | ||
["4", "The Matrix", "50", two_hours_ago], # no match | ||
["5", "Great Gatbsy", 49, one_hour_ago], | ||
] | ||
df = pd.DataFrame(data, columns=columns) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,7 +26,9 @@ | |
@source( | ||
s3.bucket("fennel-demo-data", prefix="outbrain/page_views_filter.csv"), | ||
every="1d", | ||
tier="prod", | ||
) | ||
@source(webhook.endpoint("PageViews"), tier="dev") | ||
@meta(owner="[email protected]") | ||
@dataset | ||
class PageViews: | ||
|
@@ -101,16 +103,16 @@ def extract(cls, ts: pd.Series, uuids: pd.Series): | |
@pytest.mark.integration | ||
@mock | ||
def test_outbrain(client): | ||
fake_PageViews = PageViews.with_source(webhook.endpoint("PageViews")) | ||
client.sync( | ||
datasets=[ | ||
fake_PageViews, | ||
PageViews, | ||
PageViewsByUser, | ||
], | ||
featuresets=[ | ||
Request, | ||
UserPageViewFeatures, | ||
], | ||
tier="dev", | ||
) | ||
df = pd.read_csv("fennel/client_tests/data/page_views_sample.csv") | ||
# Current time in ms | ||
|
@@ -143,7 +145,7 @@ def test_outbrain(client): | |
Request, | ||
UserPageViewFeatures, | ||
], | ||
input_feature_list=[Request], | ||
input_feature_list=[Request.uuid, Request.document_id], | ||
input_dataframe=input_df, | ||
) | ||
assert feature_df.shape[0] == 347 | ||
|
Oops, something went wrong.