forked from pinterest/querybook
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add oauth flow for querybook github integration (pinterest#1497)
* feat: Add oauth flow for querybook github integration * link datadoc to github directory * feat: Add Datadoc serializing util
- Loading branch information
Showing
21 changed files
with
728 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Add GitHub Datadoc Link | ||
Revision ID: aa328ae9dced | ||
Revises: f7b11b3e3a95 | ||
Create Date: 2024-10-23 21:04:55.052696 | ||
""" | ||
|
||
from alembic import op | ||
import sqlalchemy as sa | ||
|
||
|
||
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration
# it applies on top of (the "Revision ID" / "Revises" values in the module
# docstring).
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
branch_labels = None
depends_on = None
|
||
|
||
def upgrade():
    """Create the ``github_link`` table.

    A row references one DataDoc (``datadoc_id``) and one user (``user_id``)
    and stores a ``directory`` string that defaults to ``"datadocs"``. The
    unique constraint on ``datadoc_id`` allows at most one row per DataDoc.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column(
            "directory",
            sa.String(length=255),
            nullable=False,
            server_default="datadocs",
        ),
        sa.Column(
            "created_at",
            sa.DateTime(),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(),
            server_default=sa.text("now()"),
            nullable=False,
        ),
    ]
    constraints = [
        sa.ForeignKeyConstraint(["datadoc_id"], ["data_doc.id"]),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"]),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("datadoc_id"),
    ]
    # Columns first, then constraints: the same positional order as before.
    op.create_table("github_link", *columns, *constraints)
    # ### end Alembic commands ###
|
||
|
||
def downgrade():
    """Revert this migration by dropping the ``github_link`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    table_name = "github_link"
    op.drop_table(table_name)
    # ### end Alembic commands ###
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from app.datasource import register | ||
from lib.github.github import github_manager | ||
from typing import Dict | ||
from logic import github as logic | ||
from flask_login import current_user | ||
|
||
|
||
@register("/github/auth/", methods=["GET"])
def connect_github() -> Dict[str, str]:
    """Start the GitHub OAuth flow.

    Returns:
        Whatever ``github_manager.initiate_github_integration()`` returns.
    """
    auth_payload = github_manager.initiate_github_integration()
    return auth_payload
|
||
|
||
@register("/github/is_authenticated/", methods=["GET"])
def is_github_authenticated() -> Dict[str, bool]:
    """Report whether a GitHub access token is currently available.

    Returns:
        ``{"is_authenticated": bool}`` - true when
        ``github_manager.get_github_token()`` returns something other than
        ``None``.
    """
    # The return annotation used to be ``str`` although a dict is returned.
    is_authenticated = github_manager.get_github_token() is not None
    return {"is_authenticated": is_authenticated}
|
||
|
||
@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
    datadoc_id: int,
    directory: str,
) -> Dict:
    """Link a DataDoc to a GitHub directory on behalf of the current user.

    Args:
        datadoc_id: ID of the DataDoc, taken from the URL path.
        directory: Directory to associate with the DataDoc.

    Returns:
        The result of ``logic.create_repo_link``.
    """
    link_kwargs = {
        "datadoc_id": datadoc_id,
        "user_id": current_user.id,
        "directory": directory,
    }
    return logic.create_repo_link(**link_kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import certifi | ||
from flask import session as flask_session, request | ||
from app.auth.github_auth import GitHubLoginManager | ||
from env import QuerybookSettings | ||
from lib.logger import get_logger | ||
from app.flask_app import flask_app | ||
from typing import Optional, Dict, Any | ||
|
||
# Module-level logger.
LOG = get_logger(__file__)

# Path of the OAuth callback route; it is appended to
# QuerybookSettings.PUBLIC_URL to build the callback URL and is also the
# route registered for the callback view at the bottom of this module.
GITHUB_OAUTH_CALLBACK = "/github/oauth2callback"
# Flask session key under which the GitHub access token is stored.
GITHUB_ACCESS_TOKEN = "github_access_token"
|
||
|
||
class GitHubManager(GitHubLoginManager):
    """OAuth helper for the GitHub integration, built on ``GitHubLoginManager``.

    Adjusts the inherited OAuth configuration (scopes, callback URL and,
    optionally, client credentials) and keeps the resulting access token in
    the Flask session.
    """

    def __init__(
        self,
        additional_scopes: Optional[list] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
    ):
        """Store the integration settings, then run the parent initializer.

        Args:
            additional_scopes: Extra OAuth scopes requested on top of
                ``user`` and ``email``.
            client_id: Overrides the inherited ``client_id`` when truthy.
            client_secret: Overrides the inherited ``client_secret`` when
                truthy.
        """
        # NOTE(review): attributes are set before super().__init__(),
        # presumably because the parent reads oauth_config while
        # initializing - confirm before reordering.
        self.additional_scopes = additional_scopes or []
        self._client_id = client_id
        self._client_secret = client_secret
        super().__init__()

    @property
    def oauth_config(self) -> Dict[str, Any]:
        """Return the inherited OAuth config with integration overrides."""
        # Shallow-copy so the overrides never leak into a dict the parent
        # might be sharing between calls.
        config = dict(super().oauth_config)
        # Joining a list avoids the trailing space the old string
        # concatenation produced when there were no additional scopes.
        config["scope"] = " ".join(["user", "email", *self.additional_scopes])
        config["callback_url"] = (
            f"{QuerybookSettings.PUBLIC_URL}{GITHUB_OAUTH_CALLBACK}"
        )
        if self._client_id:
            config["client_id"] = self._client_id
        if self._client_secret:
            config["client_secret"] = self._client_secret
        return config

    def save_github_token(self, token: str) -> None:
        """Store the access token in the Flask session."""
        flask_session[GITHUB_ACCESS_TOKEN] = token
        LOG.debug("Saved GitHub token to session")

    def get_github_token(self) -> Optional[str]:
        """Return the access token from the Flask session, or ``None``."""
        return flask_session.get(GITHUB_ACCESS_TOKEN)

    def initiate_github_integration(self) -> Dict[str, str]:
        """Build the authorization URL and remember the OAuth state.

        Returns:
            ``{"url": <authorization url>}``.
        """
        github = self.oauth_session
        authorization_url, state = github.authorization_url(
            self.oauth_config["authorization_url"]
        )
        flask_session["oauth_state"] = state
        return {"url": authorization_url}

    def github_integration_callback(self) -> str:
        """Exchange the authorization response for a token and store it.

        Returns:
            The success HTML page, or the error HTML page if any step
            raises.
        """
        # NOTE(review): the "oauth_state" saved by
        # initiate_github_integration is not read here; confirm that
        # oauth_session validates it.
        try:
            github = self.oauth_session
            config = self.oauth_config
            access_token = github.fetch_token(
                config["token_url"],
                client_secret=config["client_secret"],
                authorization_response=request.url,
                cert=certifi.where(),
            )
            self.save_github_token(access_token["access_token"])
            return self.success_response()
        except Exception as e:
            # Boundary handler: the view must always answer with HTML.
            LOG.error(f"Failed to obtain credentials: {e}")
            return self.error_response(str(e))

    def success_response(self) -> str:
        """Return the HTML shown after success; it notifies the opener window."""
        return """
            <p>Success! Please close the tab.</p>
            <script>
                window.opener.receiveChildMessage()
            </script>
        """

    def error_response(self, error_message: str) -> str:
        """Return the HTML shown on failure, with the message HTML-escaped."""
        # Local import keeps this fix self-contained in the class.
        from html import escape

        # The message comes from an exception and may carry text from the
        # request or from the remote response, so it must not be emitted
        # as raw markup.
        return f"""
            <p>Failed to obtain credentials, reason: {escape(error_message)}</p>
        """
|
||
|
||
# Shared manager instance for the GitHub integration: it requests the extra
# "repo" scope and takes its client credentials from QuerybookSettings.
github_manager = GitHubManager(
    additional_scopes=["repo"],
    client_id=QuerybookSettings.GITHUB_CLIENT_ID,
    client_secret=QuerybookSettings.GITHUB_CLIENT_SECRET,
)
|
||
|
||
@flask_app.route(GITHUB_OAUTH_CALLBACK)
def github_callback() -> str:
    """Flask view for the GitHub OAuth callback route.

    Returns:
        The HTML string produced by
        ``github_manager.github_integration_callback()``.
    """
    response_html = github_manager.github_integration_callback()
    return response_html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import yaml | ||
import re | ||
from typing import List | ||
from models.datadoc import DataDoc, DataCell | ||
from const.data_doc import DataCellType | ||
from datetime import datetime, timezone | ||
|
||
|
||
def parse_datetime_as_utc(date_str: str) -> datetime:
    """Parse the given date string to a timezone-aware datetime in UTC.

    Args:
        date_str: An ISO 8601 string. A ``datetime`` instance is also
            accepted and normalized; an empty or ``None`` value yields the
            current time.

    Returns:
        A timezone-aware ``datetime`` whose tzinfo is UTC.

    Raises:
        ValueError: If a non-empty string is not valid ISO 8601 (raised by
            ``datetime.fromisoformat``).
    """
    if not date_str:
        # now(timezone.utc) is already aware; no extra replace() is needed.
        return datetime.now(timezone.utc)

    if isinstance(date_str, datetime):
        parsed = date_str
    else:
        parsed = datetime.fromisoformat(date_str)

    if parsed.utcoffset() is None:
        # astimezone() would interpret a naive value in the host's local
        # timezone, making the result machine-dependent. Naive values are
        # treated as UTC instead.
        # NOTE(review): assumes naive timestamps in this codebase are UTC.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
||
|
||
def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """Render a DataDoc as Markdown.

    The output is a YAML front matter block with the DataDoc's metadata,
    a level-1 heading with its title, and then the serialized cells.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML.
    """
    created = datadoc.created_at
    updated = datadoc.updated_at
    # DataDoc metadata goes into YAML front matter for readability.
    metadata = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "public": datadoc.public,
        "archived": datadoc.archived,
        "owner_uid": datadoc.owner_uid,
        "created_at": created.isoformat() if created else None,
        "updated_at": updated.isoformat() if updated else None,
        "meta": datadoc.meta,
        "title": datadoc.title,
    }
    try:
        metadata_yaml = yaml.dump(metadata, default_flow_style=False)
    except yaml.YAMLError as e:
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}")

    sections = [
        f"---\n{metadata_yaml}---\n\n",
        f"# {datadoc.title}\n\n",
        serialize_datacells(datadoc.cells),
    ]
    return "".join(sections)
|
||
|
||
def serialize_datacells(cells: List[DataCell]) -> str:
    """Serialize cells to Markdown, separated by blank lines.

    Each cell is emitted as an HTML comment holding its metadata as YAML,
    immediately followed by the cell's Markdown content.

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML.
    """

    def _iso_or_none(moment):
        return moment.isoformat() if moment else None

    rendered_cells = []
    for cell in cells:
        # GitHub's Markdown renderer does not treat multiple --- blocks as
        # separate YAML sections, so per-cell metadata lives in an HTML
        # comment, which also hides it from the rendered view.
        metadata = {
            "id": cell.id,
            "cell_type": cell.cell_type.name.lower(),
            "created_at": _iso_or_none(cell.created_at),
            "updated_at": _iso_or_none(cell.updated_at),
            "meta": cell.meta,
        }
        try:
            metadata_yaml = yaml.dump(metadata, default_flow_style=False)
        except yaml.YAMLError as e:
            raise ValueError(f"Error serializing cell metadata to YAML: {e}")

        comment_block = "<!--\n" + metadata_yaml.strip() + "\n-->\n"
        rendered_cells.append(comment_block + serialize_cell_content(cell))

    return "\n\n".join(rendered_cells)
|
||
|
||
def serialize_cell_content(cell: DataCell) -> str:
    """Render one cell's content as a Markdown heading plus a fenced block.

    Query cells become a ``sql`` fence titled with the cell's ``title`` meta
    (falling back to ``"Query"``), text cells become a ``text`` fence, and
    chart cells get a fixed placeholder because charts are described by
    their metadata.

    Raises:
        ValueError: If the cell type is not query, text or chart.
    """
    cell_meta = cell.meta or {}
    cell_type = cell.cell_type

    if cell_type == DataCellType.query:
        # ``or`` also covers a title that is present but None or empty,
        # which previously rendered as "None" or an empty heading.
        query_title = cell_meta.get("title") or "Query"
        # A missing context is serialized as an empty block, not a crash.
        query_text = (cell.context or "").strip()
        return f"## Query: {query_title}\n\n```sql\n{query_text}\n```\n"
    if cell_type == DataCellType.text:
        text = (cell.context or "").strip()
        return f"## Text\n\n```text\n{text}\n```\n"
    if cell_type == DataCellType.chart:
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"

    # Falling through used to return None implicitly, which surfaced later
    # as a TypeError when the caller concatenated the result.
    raise ValueError(f"Unsupported cell type: {cell_type}")
|
||
|
||
def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """Build a DataDoc from its Markdown form.

    The front matter supplies the DataDoc's metadata and the remaining
    content supplies its cells.
    """
    metadata, body = extract_front_matter(markdown_str)
    doc = create_datadoc_from_metadata(metadata)
    doc.cells = deserialize_datadoc_content(body)
    return doc
|
||
|
||
def extract_front_matter(markdown_str: str):
    """Split a Markdown document into parsed front matter and body.

    The document must start with a ``---`` delimited YAML block. LF and CRLF
    line endings are both accepted, as is trailing whitespace on the
    delimiter lines; the blank line after the closing delimiter is optional.

    Args:
        markdown_str: The full Markdown text.

    Returns:
        A ``(front_matter, content)`` tuple: the front matter parsed into a
        dict, and the text that follows it.

    Raises:
        ValueError: If the front matter is missing, is not valid YAML, or
            does not parse to a mapping.
    """
    # The previous pattern required exactly "\n---\n\n", so a file saved
    # with CRLF endings or without the blank separator line was rejected.
    front_matter_pattern = re.compile(
        r"^---[ \t]*\r?\n(.*?)\r?\n---[ \t]*\r?\n(?:[ \t]*\r?\n)?", re.DOTALL
    )
    match = front_matter_pattern.match(markdown_str)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing front matter YAML: {e}") from e

    # Callers use the result as a dict, so an empty or scalar front matter
    # is reported here rather than failing later with an AttributeError.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Invalid Markdown format: Front matter must be a YAML mapping."
        )

    content = markdown_str[match.end():]
    return front_matter, content
|
||
|
||
def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """Construct a DataDoc from a front matter metadata dict.

    Missing keys fall back to: ``public=True``, ``archived=False``,
    ``title=""`` and ``meta={}``; other missing keys become ``None``.
    Timestamps go through ``parse_datetime_as_utc``.
    """
    read = metadata.get
    constructor_fields = {
        "id": read("id"),
        "environment_id": read("environment_id"),
        "public": read("public", True),
        "archived": read("archived", False),
        "owner_uid": read("owner_uid"),
        "created_at": parse_datetime_as_utc(read("created_at")),
        "updated_at": parse_datetime_as_utc(read("updated_at")),
        "title": read("title", ""),
    }
    doc = DataDoc(**constructor_fields)
    # meta is assigned after construction, as in the original code.
    doc.meta = read("meta", {})
    return doc
|
||
|
||
def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """Parse serialized cells out of a Markdown document body.

    Each cell is expected as an HTML comment containing YAML metadata,
    followed by a ``##`` heading and a fenced block with the cell content.

    Args:
        content_str: The Markdown that follows the front matter.

    Returns:
        One ``DataCell`` per matched cell, in document order.

    Raises:
        ValueError: If a cell's metadata is not valid YAML, is not a
            mapping, or names an unknown cell type.
    """
    # Pattern to match cell metadata in HTML comments and the following content
    pattern = re.compile(r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL)

    cells = []
    for match in pattern.finditer(content_str):
        metadata_str, cell_content = match.group(1), match.group(2)
        try:
            metadata = yaml.safe_load(metadata_str)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}") from e

        # A comment holding a scalar or a list would otherwise fail on
        # ``.get`` with an AttributeError.
        if not isinstance(metadata, dict):
            raise ValueError("Invalid cell metadata: expected a YAML mapping.")

        # ``or`` also covers an explicit null cell_type.
        cell_type_str = str(metadata.get("cell_type") or "query").lower()
        try:
            cell_type = DataCellType[cell_type_str]
        except KeyError:
            # Report bad input as ValueError, like the YAML errors above.
            raise ValueError(
                f"Unknown cell type in cell metadata: {cell_type_str}"
            ) from None

        # Charts are generated from the metadata, and not from content.
        context = cell_content.strip() if cell_type != DataCellType.chart else None
        cells.append(
            DataCell(
                id=metadata.get("id"),
                cell_type=cell_type,
                context=context,
                created_at=parse_datetime_as_utc(metadata.get("created_at")),
                updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
                meta=metadata.get("meta", {}),
            )
        )
    return cells
Oops, something went wrong.