Skip to content

Commit

Permalink
feat: Add oauth flow for querybook github integration (pinterest#1497)
Browse files Browse the repository at this point in the history
* feat: Add oauth flow for querybook github integration
* link datadoc to github directory
* feat: Add Datadoc serializing util
  • Loading branch information
zhangvi7 committed Nov 13, 2024
1 parent ff34710 commit 4d09fab
Show file tree
Hide file tree
Showing 21 changed files with 728 additions and 0 deletions.
4 changes: 4 additions & 0 deletions querybook/config/querybook_default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ OAUTH_AUTHORIZATION_URL: ~
OAUTH_TOKEN_URL: ~
OAUTH_USER_PROFILE: ~

# --------------- GitHub Integration ---------------
GITHUB_CLIENT_ID: ~
GITHUB_CLIENT_SECRET: ~

# LDAP
LDAP_CONN: ~
LDAP_USER_DN: uid={},dc=example,dc=com
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Add GitHub Datadoc Link
Revision ID: aa328ae9dced
Revises: f7b11b3e3a95
Create Date: 2024-10-23 21:04:55.052696
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``github_link`` table.

    Each row references one ``data_doc`` row and one ``user`` row and stores a
    ``directory`` string (database default ``"datadocs"``). The unique
    constraint on ``datadoc_id`` allows at most one row per DataDoc.
    """

    def timestamp_column(name):
        # Non-nullable timestamp that the database fills in with now().
        return sa.Column(
            name, sa.DateTime(), server_default=sa.text("now()"), nullable=False
        )

    columns = (
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column(
            "directory",
            sa.String(length=255),
            nullable=False,
            server_default="datadocs",
        ),
        timestamp_column("created_at"),
        timestamp_column("updated_at"),
    )
    constraints = (
        sa.ForeignKeyConstraint(["datadoc_id"], ["data_doc.id"]),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"]),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("datadoc_id"),
    )
    op.create_table("github_link", *columns, *constraints)


def downgrade():
    """Drop the ``github_link`` table created by ``upgrade``."""
    table_name = "github_link"
    op.drop_table(table_name)
3 changes: 3 additions & 0 deletions querybook/server/datasources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from . import comment
from . import survey
from . import query_transform
from . import github


# Keep this at the end of imports to make sure the plugin APIs override the default ones
try:
Expand Down Expand Up @@ -47,3 +49,4 @@
survey
query_transform
api_plugin
github
26 changes: 26 additions & 0 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from app.datasource import register
from lib.github.github import github_manager
from typing import Dict
from logic import github as logic
from flask_login import current_user


@register("/github/auth/", methods=["GET"])
def connect_github() -> Dict[str, str]:
    """Start the GitHub OAuth flow.

    Returns:
        The payload produced by ``github_manager.initiate_github_integration()``.
    """
    auth_payload = github_manager.initiate_github_integration()
    return auth_payload


@register("/github/is_authenticated/", methods=["GET"])
def is_github_authenticated() -> Dict[str, bool]:
    """Report whether a GitHub access token is available.

    Returns:
        ``{"is_authenticated": bool}``; the flag is true when
        ``github_manager.get_github_token()`` returns anything other than
        ``None``.
    """
    is_authenticated = github_manager.get_github_token() is not None
    return {"is_authenticated": is_authenticated}


@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
    datadoc_id: int,
    directory: str,
) -> Dict:
    """Link a DataDoc to a GitHub directory for the current user.

    Args:
        datadoc_id: Id of the DataDoc, taken from the URL.
        directory: Directory value passed through to the link.

    Returns:
        Whatever ``logic.create_repo_link`` returns.
    """
    # NOTE(review): no permission check on datadoc_id is visible in this
    # handler -- confirm that access is enforced elsewhere.
    link_args = {
        "datadoc_id": datadoc_id,
        "user_id": current_user.id,
        "directory": directory,
    }
    return logic.create_repo_link(**link_args)
4 changes: 4 additions & 0 deletions querybook/server/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class QuerybookSettings(object):
OAUTH_USER_PROFILE = get_env_config("OAUTH_USER_PROFILE")
AZURE_TENANT_ID = get_env_config("AZURE_TENANT_ID")

# GitHub OAuth client credentials used by the GitHub integration.
# Read through get_env_config like the neighbouring settings; os.getenv would
# presumably bypass the defaults declared in the YAML config -- confirm.
GITHUB_CLIENT_ID = get_env_config("GITHUB_CLIENT_ID")
GITHUB_CLIENT_SECRET = get_env_config("GITHUB_CLIENT_SECRET")

LDAP_CONN = get_env_config("LDAP_CONN")
LDAP_USE_TLS = str(get_env_config("LDAP_USE_TLS")).lower() == "true"
LDAP_USE_BIND_USER = str(get_env_config("LDAP_USE_BIND_USER")).lower() == "true"
Expand Down
Empty file.
93 changes: 93 additions & 0 deletions querybook/server/lib/github/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import certifi
from flask import session as flask_session, request
from app.auth.github_auth import GitHubLoginManager
from env import QuerybookSettings
from lib.logger import get_logger
from app.flask_app import flask_app
from typing import Optional, Dict, Any

LOG = get_logger(__file__)

GITHUB_OAUTH_CALLBACK = "/github/oauth2callback"
GITHUB_ACCESS_TOKEN = "github_access_token"


class GitHubManager(GitHubLoginManager):
    """Runs the GitHub OAuth flow for the GitHub integration.

    Extends ``GitHubLoginManager`` by overriding the OAuth scope, the callback
    URL and (optionally) the client credentials, and keeps the resulting
    access token in the Flask session.
    """

    def __init__(
        self,
        additional_scopes: Optional[list] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
    ):
        """Store the overrides, then run the parent initializer.

        Args:
            additional_scopes: Extra OAuth scopes requested on top of
                ``user`` and ``email``.
            client_id: Replaces the inherited ``client_id`` when truthy.
            client_secret: Replaces the inherited ``client_secret`` when
                truthy.
        """
        # Assigned before super().__init__() -- presumably because the parent
        # initializer may read oauth_config; confirm before reordering.
        self.additional_scopes = additional_scopes or []
        self._client_id = client_id
        self._client_secret = client_secret
        super().__init__()

    @property
    def oauth_config(self) -> Dict[str, Any]:
        """Return the inherited OAuth config adjusted for the integration."""
        config = super().oauth_config
        # Joining one list avoids the trailing space the scope string would
        # otherwise carry when there are no additional scopes.
        config["scope"] = " ".join(["user", "email", *self.additional_scopes])
        config["callback_url"] = (
            f"{QuerybookSettings.PUBLIC_URL}{GITHUB_OAUTH_CALLBACK}"
        )
        if self._client_id:
            config["client_id"] = self._client_id
        if self._client_secret:
            config["client_secret"] = self._client_secret
        return config

    def save_github_token(self, token: str) -> None:
        """Store the access token in the Flask session."""
        flask_session[GITHUB_ACCESS_TOKEN] = token
        LOG.debug("Saved GitHub token to session")

    def get_github_token(self) -> Optional[str]:
        """Return the access token from the Flask session, or None if absent."""
        return flask_session.get(GITHUB_ACCESS_TOKEN)

    def initiate_github_integration(self) -> Dict[str, str]:
        """Build the authorization URL and remember the OAuth state.

        Returns:
            ``{"url": <authorization URL>}``.
        """
        github = self.oauth_session
        authorization_url, state = github.authorization_url(
            self.oauth_config["authorization_url"]
        )
        flask_session["oauth_state"] = state
        return {"url": authorization_url}

    def github_integration_callback(self) -> str:
        """Exchange the callback request for a token and store it.

        Returns:
            The success HTML page, or the error HTML page when anything in
            the exchange raises.
        """
        # NOTE(review): the "oauth_state" value saved by
        # initiate_github_integration is not compared against the callback
        # here -- confirm that oauth_session validates it.
        try:
            github = self.oauth_session
            access_token = github.fetch_token(
                self.oauth_config["token_url"],
                client_secret=self.oauth_config["client_secret"],
                authorization_response=request.url,
                cert=certifi.where(),
            )
            self.save_github_token(access_token["access_token"])
            return self.success_response()
        except Exception as e:
            # Boundary handler: any failure becomes an error page rather than
            # an unhandled exception.
            LOG.error(f"Failed to obtain credentials: {e}")
            return self.error_response(str(e))

    def success_response(self) -> str:
        """Return the HTML page that notifies the opener window of success."""
        return """
            <p>Success! Please close the tab.</p>
            <script>
                window.opener.receiveChildMessage()
            </script>
        """

    def error_response(self, error_message: str) -> str:
        """Return the HTML error page with ``error_message`` HTML-escaped."""
        import html

        # The message comes from an exception raised while handling the
        # callback request, so it is escaped before being placed in markup.
        safe_message = html.escape(error_message)
        return f"""
            <p>Failed to obtain credentials, reason: {safe_message}</p>
        """


# Module-level manager instance; requests the extra "repo" scope and takes its
# client credentials from QuerybookSettings.
github_manager = GitHubManager(
    client_id=QuerybookSettings.GITHUB_CLIENT_ID,
    client_secret=QuerybookSettings.GITHUB_CLIENT_SECRET,
    additional_scopes=["repo"],
)


@flask_app.route(GITHUB_OAUTH_CALLBACK)
def github_callback() -> str:
    """Flask route for the OAuth callback; returns the manager's HTML response."""
    response_html = github_manager.github_integration_callback()
    return response_html
145 changes: 145 additions & 0 deletions querybook/server/lib/github/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import yaml
import re
from typing import List
from models.datadoc import DataDoc, DataCell
from const.data_doc import DataCellType
from datetime import datetime, timezone


def parse_datetime_as_utc(date_str: str) -> datetime:
    """
    Parse the given date string to a timezone-aware datetime object in UTC.

    Accepts an ISO-8601 string or an existing ``datetime``. A falsy value
    (``None`` or ``""``) yields the current UTC time.

    Values without a UTC offset are taken to already be in UTC. Converting
    them with ``astimezone`` would interpret them as host-local time and make
    the result depend on the server's timezone.
    """
    if not date_str:
        return datetime.now(timezone.utc)

    if isinstance(date_str, datetime):
        parsed = date_str
    else:
        parsed = datetime.fromisoformat(date_str)

    if parsed.utcoffset() is None:
        # Naive value: attach UTC without shifting the wall-clock fields.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """Render a DataDoc as Markdown.

    The output is a YAML front matter block with the DataDoc's metadata, a
    level-one heading with its title, then the serialized cells.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML.
    """

    def iso_or_none(moment):
        # Timestamps are written as ISO strings; missing ones stay None.
        return moment.isoformat() if moment else None

    metadata = dict(
        id=datadoc.id,
        environment_id=datadoc.environment_id,
        public=datadoc.public,
        archived=datadoc.archived,
        owner_uid=datadoc.owner_uid,
        created_at=iso_or_none(datadoc.created_at),
        updated_at=iso_or_none(datadoc.updated_at),
        meta=datadoc.meta,
        title=datadoc.title,
    )
    try:
        metadata_yaml = yaml.dump(metadata, default_flow_style=False)
        front_matter = f"---\n{metadata_yaml}---\n\n"
    except yaml.YAMLError as e:
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}")

    heading = f"# {datadoc.title}\n\n"
    body = serialize_datacells(datadoc.cells)
    return "".join((front_matter, heading, body))


def serialize_datacells(cells: List[DataCell]) -> str:
    """Serialize cells to Markdown sections separated by blank lines.

    Each section starts with the cell's metadata as YAML inside an HTML
    comment, followed by the cell content. GitHub's Markdown renderer does not
    treat repeated ``---`` blocks as separate YAML sections, so the comment
    keeps the metadata out of the rendered view.

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML.
    """

    def iso_or_none(moment):
        return moment.isoformat() if moment else None

    def render(cell):
        metadata = dict(
            id=cell.id,
            cell_type=cell.cell_type.name.lower(),
            created_at=iso_or_none(cell.created_at),
            updated_at=iso_or_none(cell.updated_at),
            meta=cell.meta,
        )
        try:
            metadata_yaml = yaml.dump(metadata, default_flow_style=False)
        except yaml.YAMLError as e:
            raise ValueError(f"Error serializing cell metadata to YAML: {e}")

        comment = f"<!--\n{metadata_yaml.strip()}\n-->\n"
        return comment + serialize_cell_content(cell)

    return "\n\n".join([render(cell) for cell in cells])


def serialize_cell_content(cell: DataCell) -> str:
    """Render one cell as a Markdown heading plus a fenced code block.

    Query cells become a ``sql`` block titled with ``meta["title"]`` (default
    ``"Query"``), text cells a ``text`` block, and chart cells a fixed
    placeholder block.

    Raises:
        ValueError: If the cell type is not query, text or chart. Without
            this the function would return ``None`` and the caller would fail
            when concatenating it.
    """
    cell_meta = cell.meta or {}
    cell_type = cell.cell_type

    if cell_type == DataCellType.chart:
        # Chart cells have no body of their own; only the placeholder is written.
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"

    # A missing context is written as an empty block instead of failing on
    # None.strip().
    body = (cell.context or "").strip()

    if cell_type == DataCellType.query:
        query_title = cell_meta.get("title", "Query")
        return f"## Query: {query_title}\n\n```sql\n{body}\n```\n"
    if cell_type == DataCellType.text:
        return f"## Text\n\n```text\n{body}\n```\n"

    raise ValueError(f"Unsupported cell type for Markdown serialization: {cell_type!r}")


def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """Rebuild a DataDoc and its cells from serialized Markdown.

    Raises:
        ValueError: Propagated from front matter or cell metadata parsing.
    """
    metadata, body = extract_front_matter(markdown_str)
    restored = create_datadoc_from_metadata(metadata)
    restored.cells = deserialize_datadoc_content(body)
    return restored


def extract_front_matter(markdown_str: str):
    """Split serialized Markdown into its YAML front matter and the rest.

    The document must start with ``---``, the YAML, a closing ``---`` and one
    blank line.

    Returns:
        A ``(front_matter, content)`` tuple: the parsed YAML mapping and the
        text after the front matter block.

    Raises:
        ValueError: If the front matter is missing, is not valid YAML, or
            does not parse to a mapping.
    """
    front_matter_pattern = re.compile(r"^---\n(.*?)\n---\n\n", re.DOTALL)
    match = front_matter_pattern.match(markdown_str)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing front matter YAML: {e}") from e

    # Callers read the metadata with .get(); reject scalars and lists here
    # rather than letting them fail later with an AttributeError.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Invalid Markdown format: Front matter must be a YAML mapping."
        )

    content = markdown_str[match.end():]
    return front_matter, content


def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """Build a DataDoc from a parsed front matter mapping.

    Missing keys fall back to: ``public=True``, ``archived=False``,
    ``title=""``, ``meta={}``; missing timestamps are filled in by
    ``parse_datetime_as_utc``.
    """
    read = metadata.get
    constructor_args = {
        "id": read("id"),
        "environment_id": read("environment_id"),
        "public": read("public", True),
        "archived": read("archived", False),
        "owner_uid": read("owner_uid"),
        "created_at": parse_datetime_as_utc(read("created_at")),
        "updated_at": parse_datetime_as_utc(read("updated_at")),
        "title": read("title", ""),
    }
    restored = DataDoc(**constructor_args)
    # meta is assigned after construction rather than passed to the constructor.
    restored.meta = read("meta", {})
    return restored


def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """Rebuild the cells from the Markdown that follows the front matter.

    Every match of "HTML comment with YAML metadata, heading, fenced block"
    becomes one DataCell. Text that does not match the pattern is ignored.

    Raises:
        ValueError: If a cell's metadata is not valid YAML.
    """
    # Group 1: YAML inside the HTML comment; group 2: fenced block body.
    cell_pattern = re.compile(
        r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL
    )

    def build_cell(found):
        raw_metadata, raw_body = found.group(1, 2)
        try:
            cell_info = yaml.safe_load(raw_metadata)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}")

        # NOTE(review): a name that is not a DataCellType member presumably
        # raises KeyError here -- confirm whether callers expect that.
        kind = DataCellType[cell_info.get("cell_type", "query").lower()]
        if kind == DataCellType.chart:
            # Charts are generated from the metadata, and not from content
            body = None
        else:
            body = raw_body.strip()

        return DataCell(
            id=cell_info.get("id"),
            cell_type=kind,
            context=body,
            created_at=parse_datetime_as_utc(cell_info.get("created_at")),
            updated_at=parse_datetime_as_utc(cell_info.get("updated_at")),
            meta=cell_info.get("meta", {}),
        )

    return [build_cell(found) for found in cell_pattern.finditer(content_str)]
Loading

0 comments on commit 4d09fab

Please sign in to comment.