From 5a6db84142bc585125e3b716c4e4717c787ee464 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 16 Oct 2024 16:10:48 +0000 Subject: [PATCH 1/7] feat: Add oauth flow for querybook github integration --- querybook/server/datasources/__init__.py | 3 + querybook/server/datasources/github.py | 16 +++ .../server/lib/github_integration/__init__.py | 0 .../github_integration/github_integration.py | 110 ++++++++++++++++++ .../DataDocGitHub/DataDocGitHubButton.tsx | 61 ++++++++++ .../components/DataDocGitHub/GitHub.scss | 8 ++ .../components/DataDocGitHub/GitHubAuth.tsx | 31 +++++ .../components/DataDocGitHub/GitHubModal.tsx | 82 +++++++++++++ .../DataDocRightSidebar.tsx | 4 + querybook/webapp/const/analytics.ts | 4 + querybook/webapp/resource/github.ts | 11 ++ querybook/webapp/ui/Icon/LucideIcons.ts | 2 + webpack.config.js | 4 + 13 files changed, 336 insertions(+) create mode 100644 querybook/server/datasources/github.py create mode 100644 querybook/server/lib/github_integration/__init__.py create mode 100644 querybook/server/lib/github_integration/github_integration.py create mode 100644 querybook/webapp/components/DataDocGitHub/DataDocGitHubButton.tsx create mode 100644 querybook/webapp/components/DataDocGitHub/GitHub.scss create mode 100644 querybook/webapp/components/DataDocGitHub/GitHubAuth.tsx create mode 100644 querybook/webapp/components/DataDocGitHub/GitHubModal.tsx create mode 100644 querybook/webapp/resource/github.ts diff --git a/querybook/server/datasources/__init__.py b/querybook/server/datasources/__init__.py index 0ba28d65f..bad09b4ef 100644 --- a/querybook/server/datasources/__init__.py +++ b/querybook/server/datasources/__init__.py @@ -18,6 +18,8 @@ from . import comment from . import survey from . import query_transform +from . import github + # Keep this at the end of imports to make sure the plugin APIs override the default ones try: @@ -47,3 +49,4 @@ survey query_transform api_plugin +github diff --git a/querybook/server/datasources/github.py b/querybook/server/datasources/github.py new file mode 100644 index 000000000..82420604c --- /dev/null +++ b/querybook/server/datasources/github.py @@ -0,0 +1,16 @@ +from app.datasource import register +from lib.github_integration.github_integration import get_github_manager +from typing import Dict + + +@register("/github/auth/", methods=["GET"]) +def connect_github() -> Dict[str, str]: + github_manager = get_github_manager() + return github_manager.initiate_github_integration() + + +@register("/github/is_authenticated/", methods=["GET"]) +def is_github_authenticated() -> str: + github_manager = get_github_manager() + is_authenticated = github_manager.get_github_token() is not None + return {"is_authenticated": is_authenticated} diff --git a/querybook/server/lib/github_integration/__init__.py b/querybook/server/lib/github_integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/querybook/server/lib/github_integration/github_integration.py b/querybook/server/lib/github_integration/github_integration.py new file mode 100644 index 000000000..4cced5956 --- /dev/null +++ b/querybook/server/lib/github_integration/github_integration.py @@ -0,0 +1,110 @@ +import certifi +from flask import session as flask_session, request +from app.auth.github_auth import GitHubLoginManager +from env import QuerybookSettings +from lib.logger import get_logger +from app.flask_app import flask_app +from typing import Optional, Dict, Any + +LOG = get_logger(__file__) + + +GITHUB_OAUTH_CALLBACK = "/github/oauth2callback" + + +class GitHubIntegrationManager(GitHubLoginManager): + def __init__(self, additional_scopes: Optional[list] = None): + self.additional_scopes = additional_scopes or [] + super().__init__() + + @property + def oauth_config(self) -> Dict[str, Any]: + config = super().oauth_config + config["scope"] = "user email " + " ".join(self.additional_scopes) + config[ + "callback_url" + ] = f"{QuerybookSettings.PUBLIC_URL}{GITHUB_OAUTH_CALLBACK}" + return config + + def save_github_token(self, token: str) -> None: + flask_session["github_access_token"] = token + LOG.debug("Saved GitHub token to session") + + def get_github_token(self) -> Optional[str]: + return flask_session.get("github_access_token") + + def initiate_github_integration(self) -> Dict[str, str]: + github = self.oauth_session + authorization_url, state = github.authorization_url( + self.oauth_config["authorization_url"] + ) + flask_session["oauth_state"] = state + return {"url": authorization_url} + + def github_integration_callback(self) -> str: + try: + github = self.oauth_session + access_token = github.fetch_token( + self.oauth_config["token_url"], + client_secret=self.oauth_config["client_secret"], + authorization_response=request.url, + cert=certifi.where(), + ) + self.save_github_token(access_token["access_token"]) + return self.success_response() + except Exception as e: + LOG.error(f"Failed to obtain credentials: {e}") + return self.error_response(str(e)) + + def success_response(self) -> str: + return """ +

Success! Please close the tab.

+ + """ + + def error_response(self, error_message: str) -> str: + return f""" +

Failed to obtain credentials, reason: {error_message}

+ """ + + +def get_github_manager() -> GitHubIntegrationManager: + return GitHubIntegrationManager(additional_scopes=["repo"]) + + +@flask_app.route(GITHUB_OAUTH_CALLBACK) +def github_callback() -> str: + github_manager = get_github_manager() + return github_manager.github_integration_callback() + + +# Test GitHub OAuth Flow +def main(): + github_manager = GitHubIntegrationManager() + oauth_config = github_manager.oauth_config + client_id = oauth_config["client_id"] + client_secret = oauth_config["client_secret"] + + from requests_oauthlib import OAuth2Session + + github = OAuth2Session(client_id) + authorization_url, state = github.authorization_url( + oauth_config["authorization_url"] + ) + print("Please go here and authorize,", authorization_url) + + redirect_response = input("Paste the full redirect URL here:") + github.fetch_token( + oauth_config["token_url"], + client_secret=client_secret, + authorization_response=redirect_response, + ) + + user_profile = github.get(oauth_config["profile_url"]).json() + print(user_profile) + + +if __name__ == "__main__": + main() diff --git a/querybook/webapp/components/DataDocGitHub/DataDocGitHubButton.tsx b/querybook/webapp/components/DataDocGitHub/DataDocGitHubButton.tsx new file mode 100644 index 000000000..a34f4626e --- /dev/null +++ b/querybook/webapp/components/DataDocGitHub/DataDocGitHubButton.tsx @@ -0,0 +1,61 @@ +import React, { useCallback, useEffect, useState } from 'react'; + +import { GitHubResource } from 'resource/github'; +import { IconButton } from 'ui/Button/IconButton'; + +import { GitHubModal } from './GitHubModal'; + +interface IProps { + docId: number; +} + +export const DataDocGitHubButton: React.FunctionComponent = ({ + docId, +}) => { + const [isModalOpen, setIsModalOpen] = useState(false); + const [isAuthenticated, setIsAuthenticated] = useState(false); + + useEffect(() => { + const checkAuthentication = async () => { + try { + const { data } = await GitHubResource.isAuthenticated(); + setIsAuthenticated(data.is_authenticated); + } catch (error) { + console.error( + 'Failed to check GitHub authentication status:', + error + ); + } + }; + + checkAuthentication(); + }, []); + + const handleOpenModal = useCallback(() => { + setIsModalOpen(true); + }, []); + + const handleCloseModal = useCallback(() => { + setIsModalOpen(false); + }, []); + + return ( + <> + + {isModalOpen && ( + + )} + + ); +}; diff --git a/querybook/webapp/components/DataDocGitHub/GitHub.scss b/querybook/webapp/components/DataDocGitHub/GitHub.scss new file mode 100644 index 000000000..f7b911929 --- /dev/null +++ b/querybook/webapp/components/DataDocGitHub/GitHub.scss @@ -0,0 +1,8 @@ +.GitHubAuth { + text-align: center; + padding: 20px; +} + +.GitHubAuth-icon { + margin-bottom: 20px; +} diff --git a/querybook/webapp/components/DataDocGitHub/GitHubAuth.tsx b/querybook/webapp/components/DataDocGitHub/GitHubAuth.tsx new file mode 100644 index 000000000..0f3caebe6 --- /dev/null +++ b/querybook/webapp/components/DataDocGitHub/GitHubAuth.tsx @@ -0,0 +1,31 @@ +import React from 'react'; + +import { Button } from 'ui/Button/Button'; +import { Icon } from 'ui/Icon/Icon'; +import { Message } from 'ui/Message/Message'; + +import './GitHub.scss'; + +interface IProps { + onAuthenticate: () => void; +} + +export const GitHubAuth: React.FunctionComponent = ({ + onAuthenticate, +}) => ( +
+ + +
+); diff --git a/querybook/webapp/components/DataDocGitHub/GitHubModal.tsx b/querybook/webapp/components/DataDocGitHub/GitHubModal.tsx new file mode 100644 index 000000000..ac84b0240 --- /dev/null +++ b/querybook/webapp/components/DataDocGitHub/GitHubModal.tsx @@ -0,0 +1,82 @@ +import React, { useCallback, useState } from 'react'; + +import { ComponentType, ElementType } from 'const/analytics'; +import { trackClick } from 'lib/analytics'; +import { GitHubResource, IGitHubAuthResponse } from 'resource/github'; +import { Message } from 'ui/Message/Message'; +import { Modal } from 'ui/Modal/Modal'; + +import { GitHubAuth } from './GitHubAuth'; + +interface IProps { + docId: number; + isAuthenticated: boolean; + setIsAuthenticated: (isAuthenticated: boolean) => void; + onClose: () => void; +} + +export const GitHubModal: React.FunctionComponent = ({ + docId, + isAuthenticated, + setIsAuthenticated, + onClose, +}) => { + const [errorMessage, setErrorMessage] = useState(null); + + const handleConnectGitHub = useCallback(async () => { + trackClick({ + component: ComponentType.DATADOC_PAGE, + element: ElementType.GITHUB_CONNECT_BUTTON, + }); + + try { + const { data }: { data: IGitHubAuthResponse } = + await GitHubResource.connectGithub(); + const url = data.url; + if (!url) { + throw new Error('Failed to get GitHub authentication URL'); + } + const authWindow = window.open(url); + + const receiveMessage = () => { + authWindow.close(); + delete window.receiveChildMessage; + window.removeEventListener('message', receiveMessage, false); + setIsAuthenticated(true); + }; + window.receiveChildMessage = receiveMessage; + + // If the user closes the authentication window manually, clean up + const timer = setInterval(() => { + if (authWindow.closed) { + clearInterval(timer); + window.removeEventListener( + 'message', + receiveMessage, + false + ); + throw new Error('Authentication process failed'); + } + }, 1000); + } catch (error) { + console.error('GitHub authentication failed:', error); + setErrorMessage('GitHub authentication failed. Please try again.'); + } + }, [setIsAuthenticated]); + + return ( + +
+ {isAuthenticated ? ( + + ) : ( + + )} + {errorMessage && ( + + )} + +
+
+ ); +}; diff --git a/querybook/webapp/components/DataDocRightSidebar/DataDocRightSidebar.tsx b/querybook/webapp/components/DataDocRightSidebar/DataDocRightSidebar.tsx index c22e012f4..99dbb2266 100644 --- a/querybook/webapp/components/DataDocRightSidebar/DataDocRightSidebar.tsx +++ b/querybook/webapp/components/DataDocRightSidebar/DataDocRightSidebar.tsx @@ -3,6 +3,7 @@ import { useDispatch, useSelector } from 'react-redux'; import { DataDocBoardsButton } from 'components/DataDocBoardsButton/DataDocBoardsButton'; import { DataDocDAGExporterButton } from 'components/DataDocDAGExporter/DataDocDAGExporterButton'; +import { DataDocGitHubButton } from 'components/DataDocGitHub/DataDocGitHubButton'; import { DataDocTemplateButton } from 'components/DataDocTemplateButton/DataDocTemplateButton'; import { DataDocUIGuide } from 'components/UIGuide/DataDocUIGuide'; import { ComponentType, ElementType } from 'const/analytics'; @@ -83,6 +84,8 @@ export const DataDocRightSidebar: React.FunctionComponent = ({ ); + const githubButtonDOM = ; + const buttonSection = (
@@ -131,6 +134,7 @@ export const DataDocRightSidebar: React.FunctionComponent = ({
{runAllButtonDOM} + {githubButtonDOM} {isEditable && exporterExists && ( )} diff --git a/querybook/webapp/const/analytics.ts b/querybook/webapp/const/analytics.ts index 7d923b14a..a71412c34 100644 --- a/querybook/webapp/const/analytics.ts +++ b/querybook/webapp/const/analytics.ts @@ -115,6 +115,10 @@ export enum ElementType { QUERY_GENERATION_REJECT_BUTTON = 'QUERY_GENERATION_REJECT_BUTTON', QUERY_GENERATION_APPLY_BUTTON = 'QUERY_GENERATION_APPLY_BUTTON', QUERY_GENERATION_APPLY_AND_RUN_BUTTON = 'QUERY_GENERATION_APPLY_AND_RUN_BUTTON', + + // Github Integration + GITHUB_CONNECT_BUTTON = 'GITHUB_CONNECT_BUTTON', + GITHUB_LINK_BUTTON = 'GITHUB_LINK_BUTTON', } export interface EventData { diff --git a/querybook/webapp/resource/github.ts b/querybook/webapp/resource/github.ts new file mode 100644 index 000000000..678a0f816 --- /dev/null +++ b/querybook/webapp/resource/github.ts @@ -0,0 +1,11 @@ +import ds from 'lib/datasource'; + +export interface IGitHubAuthResponse { + url: string; +} + +export const GitHubResource = { + connectGithub: () => ds.fetch('/github/auth/'), + isAuthenticated: () => + ds.fetch<{ is_authenticated: boolean }>('/github/is_authenticated/'), +}; diff --git a/querybook/webapp/ui/Icon/LucideIcons.ts b/querybook/webapp/ui/Icon/LucideIcons.ts index bac6f3be9..f1c8a12d5 100644 --- a/querybook/webapp/ui/Icon/LucideIcons.ts +++ b/querybook/webapp/ui/Icon/LucideIcons.ts @@ -52,6 +52,7 @@ import { FileText, Filter, FormInput, + Github, GripVertical, Hash, HelpCircle, @@ -167,6 +168,7 @@ const AllLucideIcons = { FileText, Filter, FormInput, + Github, GripVertical, Hash, HelpCircle, diff --git a/webpack.config.js b/webpack.config.js index 8658ef1cd..46b6bfb8a 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -48,6 +48,10 @@ function getDevServerSettings(env) { target: QUERYBOOK_UPSTREAM, changeOrigin: true, }, + '/github/oauth2callback': { + target: QUERYBOOK_UPSTREAM, + changeOrigin: true, + }, }, publicPath: '/build/', onListening: (server) => { From 27dbd018d2c9ae41b12530a9c9a69d8c51176125 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Tue, 29 Oct 2024 16:13:29 +0000 Subject: [PATCH 2/7] address comments --- .../config/querybook_default_config.yaml | 4 ++ querybook/server/datasources/github.py | 4 +- querybook/server/env.py | 4 ++ .../__init__.py | 0 .../github.py} | 59 +++++++------------ 5 files changed, 30 insertions(+), 41 deletions(-) rename querybook/server/lib/{github_integration => github}/__init__.py (100%) rename querybook/server/lib/{github_integration/github_integration.py => github/github.py} (65%) diff --git a/querybook/config/querybook_default_config.yaml b/querybook/config/querybook_default_config.yaml index 42bd611c2..268a12424 100644 --- a/querybook/config/querybook_default_config.yaml +++ b/querybook/config/querybook_default_config.yaml @@ -45,6 +45,10 @@ OAUTH_AUTHORIZATION_URL: ~ OAUTH_TOKEN_URL: ~ OAUTH_USER_PROFILE: ~ +# --------------- GitHub Integration --------------- +GITHUB_CLIENT_ID: ~ +GITHUB_CLIENT_SECRET: ~ + # LDAP LDAP_CONN: ~ LDAP_USER_DN: uid={},dc=example,dc=com diff --git a/querybook/server/datasources/github.py b/querybook/server/datasources/github.py index 82420604c..08601cdf5 100644 --- a/querybook/server/datasources/github.py +++ b/querybook/server/datasources/github.py @@ -1,16 +1,14 @@ from app.datasource import register -from lib.github_integration.github_integration import get_github_manager +from lib.github.github import github_manager from typing import Dict @register("/github/auth/", methods=["GET"]) def connect_github() -> Dict[str, str]: - github_manager = get_github_manager() return github_manager.initiate_github_integration() @register("/github/is_authenticated/", methods=["GET"]) def is_github_authenticated() -> str: - github_manager = get_github_manager() is_authenticated = github_manager.get_github_token() is not None return {"is_authenticated": is_authenticated} diff --git a/querybook/server/env.py b/querybook/server/env.py index 2128717a3..2eedb83d9 100644 --- a/querybook/server/env.py +++ b/querybook/server/env.py @@ -89,6 +89,10 @@ class QuerybookSettings(object): OAUTH_USER_PROFILE = get_env_config("OAUTH_USER_PROFILE") AZURE_TENANT_ID = get_env_config("AZURE_TENANT_ID") + # GitHub App settings for feature integration + GITHUB_CLIENT_ID = os.getenv("GITHUB_CLIENT_ID") + GITHUB_CLIENT_SECRET = os.getenv("GITHUB_CLIENT_SECRET") + LDAP_CONN = get_env_config("LDAP_CONN") LDAP_USE_TLS = str(get_env_config("LDAP_USE_TLS")).lower() == "true" LDAP_USE_BIND_USER = str(get_env_config("LDAP_USE_BIND_USER")).lower() == "true" diff --git a/querybook/server/lib/github_integration/__init__.py b/querybook/server/lib/github/__init__.py similarity index 100% rename from querybook/server/lib/github_integration/__init__.py rename to querybook/server/lib/github/__init__.py diff --git a/querybook/server/lib/github_integration/github_integration.py b/querybook/server/lib/github/github.py similarity index 65% rename from querybook/server/lib/github_integration/github_integration.py rename to querybook/server/lib/github/github.py index 4cced5956..ad3b46232 100644 --- a/querybook/server/lib/github_integration/github_integration.py +++ b/querybook/server/lib/github/github.py @@ -8,13 +8,20 @@ LOG = get_logger(__file__) - GITHUB_OAUTH_CALLBACK = "/github/oauth2callback" +GITHUB_ACCESS_TOKEN = "github_access_token" -class GitHubIntegrationManager(GitHubLoginManager): - def __init__(self, additional_scopes: Optional[list] = None): +class GitHubManager(GitHubLoginManager): + def __init__( + self, + additional_scopes: Optional[list] = None, + client_id: Optional[str] = None, + client_secret: Optional[str] = None, + ): self.additional_scopes = additional_scopes or [] + self._client_id = client_id + self._client_secret = client_secret super().__init__() @property @@ -24,14 +31,18 @@ def oauth_config(self) -> Dict[str, Any]: config[ "callback_url" ] = f"{QuerybookSettings.PUBLIC_URL}{GITHUB_OAUTH_CALLBACK}" + if self._client_id: + config["client_id"] = self._client_id + if self._client_secret: + config["client_secret"] = self._client_secret return config def save_github_token(self, token: str) -> None: - flask_session["github_access_token"] = token + flask_session[GITHUB_ACCESS_TOKEN] = token LOG.debug("Saved GitHub token to session") def get_github_token(self) -> Optional[str]: - return flask_session.get("github_access_token") + return flask_session.get(GITHUB_ACCESS_TOKEN) def initiate_github_integration(self) -> Dict[str, str]: github = self.oauth_session @@ -70,41 +81,13 @@ def error_response(self, error_message: str) -> str: """ -def get_github_manager() -> GitHubIntegrationManager: - return GitHubIntegrationManager(additional_scopes=["repo"]) +github_manager = GitHubManager( + additional_scopes=["repo"], + client_id=QuerybookSettings.GITHUB_CLIENT_ID, + client_secret=QuerybookSettings.GITHUB_CLIENT_SECRET, +) @flask_app.route(GITHUB_OAUTH_CALLBACK) def github_callback() -> str: - github_manager = get_github_manager() return github_manager.github_integration_callback() - - -# Test GitHub OAuth Flow -def main(): - github_manager = GitHubIntegrationManager() - oauth_config = github_manager.oauth_config - client_id = oauth_config["client_id"] - client_secret = oauth_config["client_secret"] - - from requests_oauthlib import OAuth2Session - - github = OAuth2Session(client_id) - authorization_url, state = github.authorization_url( - oauth_config["authorization_url"] - ) - print("Please go here and authorize,", authorization_url) - - redirect_response = input("Paste the full redirect URL here:") - github.fetch_token( - oauth_config["token_url"], - client_secret=client_secret, - authorization_response=redirect_response, - ) - - user_profile = github.get(oauth_config["profile_url"]).json() - print(user_profile) - - -if __name__ == "__main__": - main() From 9932e31cd897ffecb77551bc9a7040b097066920 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 19:12:05 +0000 Subject: [PATCH 3/7] link datadoc to github directory --- querybook/server/datasources/github.py | 12 ++++++++ querybook/server/logic/github.py | 40 ++++++++++++++++++++++++++ querybook/server/models/__init__.py | 1 + querybook/server/models/github.py | 37 ++++++++++++++++++++++++ 4 files changed, 90 insertions(+) create mode 100644 querybook/server/logic/github.py create mode 100644 querybook/server/models/github.py diff --git a/querybook/server/datasources/github.py b/querybook/server/datasources/github.py index 08601cdf5..50d9af68f 100644 --- a/querybook/server/datasources/github.py +++ b/querybook/server/datasources/github.py @@ -1,6 +1,8 @@ from app.datasource import register from lib.github.github import github_manager from typing import Dict +from logic import github as logic +from flask_login import current_user @register("/github/auth/", methods=["GET"]) @@ -12,3 +14,13 @@ def connect_github() -> Dict[str, str]: def is_github_authenticated() -> str: is_authenticated = github_manager.get_github_token() is not None return {"is_authenticated": is_authenticated} + + +@register("/github/datadocs//link/", methods=["POST"]) +def link_datadoc_to_github( + datadoc_id: int, + directory: str, +) -> Dict: + return logic.create_repo_link( + datadoc_id=datadoc_id, user_id=current_user.id, directory=directory + ) diff --git a/querybook/server/logic/github.py b/querybook/server/logic/github.py new file mode 100644 index 000000000..3415dcc2e --- /dev/null +++ b/querybook/server/logic/github.py @@ -0,0 +1,40 @@ +from app.db import with_session +from models.github import GitHubLink +from models.datadoc import DataDoc + + +@with_session +def create_repo_link( + datadoc_id: int, + user_id: int, + directory: str, + commit=True, + session=None, +): + datadoc = DataDoc.get(id=datadoc_id, session=session) + assert datadoc is not None, f"DataDoc with id {datadoc_id} not found" + + github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session) + assert ( + github_link is None + ), f"GitHub link for DataDoc with id {datadoc_id} already exists" + + github_link = GitHubLink.create( + { + "datadoc_id": datadoc_id, + "user_id": user_id, + "directory": directory, + }, + commit=commit, + session=session, + ) + return github_link + + +@with_session +def get_repo_link(datadoc_id: int, session=None): + github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session) + assert ( + github_link is not None + ), f"GitHub link for DataDoc with id {datadoc_id} not found" + return github_link diff --git a/querybook/server/models/__init__.py b/querybook/server/models/__init__.py index cf3dce9f2..6550df625 100644 --- a/querybook/server/models/__init__.py +++ b/querybook/server/models/__init__.py @@ -15,3 +15,4 @@ from .data_element import * from .comment import * from .survey import * +from .github import * diff --git a/querybook/server/models/github.py b/querybook/server/models/github.py new file mode 100644 index 000000000..e802a656c --- /dev/null +++ b/querybook/server/models/github.py @@ -0,0 +1,37 @@ +import sqlalchemy as sql +from sqlalchemy.sql import func +from lib.sqlalchemy import CRUDMixin +from sqlalchemy.orm import backref, relationship +from app import db + +Base = db.Base + + +class GitHubLink(Base, CRUDMixin): + __tablename__ = "github_link" + id = sql.Column(sql.Integer, primary_key=True, autoincrement=True) + datadoc_id = sql.Column( + sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True + ) + user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False) + directory = sql.Column(sql.String(255), nullable=False) + created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False) + updated_at = sql.Column( + sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False + ) + + datadoc = relationship( + "DataDoc", + backref=backref("github_link", uselist=False, cascade="all, delete-orphan"), + ) + user = relationship("User", backref=backref("github_link", uselist=False)) + + def to_dict(self): + return { + "id": self.id, + "datadoc_id": self.datadoc_id, + "user_id": self.user_id, + "directory": self.directory, + "created_at": self.created_at, + "updated_at": self.updated_at, + } From 18bd8b3c741c03dcfae7c4b4a4f35f835cd0bd8d Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 20:44:35 +0000 Subject: [PATCH 4/7] 3.35.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 8b9712a2d..2387207b2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "querybook", - "version": "3.34.2", + "version": "3.35.0", "description": "A Big Data Webapp", "private": true, "scripts": { From daa18d2098818028689450cd670053be4fc8fff2 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 21:05:56 +0000 Subject: [PATCH 5/7] alembic migration --- .../aa328ae9dced_add_github_datadoc_link.py | 56 +++++++++++++++++++ querybook/server/models/github.py | 2 +- 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py diff --git a/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py b/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py new file mode 100644 index 000000000..522d9df9d --- /dev/null +++ b/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py @@ -0,0 +1,56 @@ +"""Add GitHub Datadoc Link + +Revision ID: aa328ae9dced +Revises: f7b11b3e3a95 +Create Date: 2024-10-23 21:04:55.052696 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "aa328ae9dced" +down_revision = "f7b11b3e3a95" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "github_link", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("datadoc_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column( + "directory", + sa.String(length=255), + nullable=False, + server_default="datadocs", + ), + sa.Column( + "created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False + ), + sa.Column( + "updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False + ), + sa.ForeignKeyConstraint( + ["datadoc_id"], + ["data_doc.id"], + ), + sa.ForeignKeyConstraint( + ["user_id"], + ["user.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("datadoc_id"), + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("github_link") + # ### end Alembic commands ### diff --git a/querybook/server/models/github.py b/querybook/server/models/github.py index e802a656c..9405c3fcb 100644 --- a/querybook/server/models/github.py +++ b/querybook/server/models/github.py @@ -14,7 +14,7 @@ class GitHubLink(Base, CRUDMixin): sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True ) user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False) - directory = sql.Column(sql.String(255), nullable=False) + directory = sql.Column(sql.String(255), nullable=False, default="datadocs") created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False) updated_at = sql.Column( sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False From bcede5dd663179635bf9ab33646899cf3d29d7ea Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 21:33:32 +0000 Subject: [PATCH 6/7] feat: Add Datadoc serializing util --- querybook/server/lib/github/serializers.py | 118 +++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 querybook/server/lib/github/serializers.py diff --git a/querybook/server/lib/github/serializers.py b/querybook/server/lib/github/serializers.py new file mode 100644 index 000000000..e6eb84924 --- /dev/null +++ b/querybook/server/lib/github/serializers.py @@ -0,0 +1,118 @@ +import yaml +import re +from typing import List +from models.datadoc import DataDoc, DataCell +from const.data_doc import DataCellType + + +def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str: + # Serialize DataDoc metadata to YAML front matter for readability + datadoc_metadata = { + "id": datadoc.id, + "environment_id": datadoc.environment_id, + "public": datadoc.public, + "archived": datadoc.archived, + "owner_uid": datadoc.owner_uid, + "created_at": datadoc.created_at.isoformat() if datadoc.created_at else None, + "updated_at": datadoc.updated_at.isoformat() if datadoc.updated_at else None, + "meta": datadoc.meta, + "title": datadoc.title, + } + front_matter = ( + f"---\n{yaml.dump(datadoc_metadata, default_flow_style=False)}---\n\n" + ) + + title = f"# {datadoc.title}\n\n" + content = serialize_datacells(datadoc.cells) + markdown_content = front_matter + title + content + return markdown_content + + +def serialize_datacells(cells: List[DataCell]) -> str: + lines = [] + for cell in cells: + # Since GitHub's Markdown renderer does not recognize multiple --- blocks as separate YAML sections, + # we serialize cell metadata in HTML comment to hide it from rendered view + cell_metadata = { + "id": cell.id, + "cell_type": cell.cell_type.name.lower(), + "created_at": cell.created_at.isoformat() if cell.created_at else None, + "updated_at": cell.updated_at.isoformat() if cell.updated_at else None, + "meta": cell.meta, + } + cell_metadata_yaml = yaml.dump(cell_metadata, default_flow_style=False) + cell_metadata_comment = f"\n" + + cell_content = serialize_cell_content(cell) + lines.append(cell_metadata_comment + cell_content) + + return "\n\n".join(lines) + + +def serialize_cell_content(cell: DataCell) -> str: + if cell.cell_type == DataCellType.query: + query_title = cell.meta.get("title", "Query") + return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n" + elif cell.cell_type == DataCellType.text: + return f"{cell.context.strip()}\n" + elif cell.cell_type == DataCellType.chart: + return "## Chart\n\n*Chart generated from the metadata.*\n" + + +def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc: + front_matter, content = extract_front_matter(markdown_str) + datadoc = create_datadoc_from_metadata(front_matter) + datadoc.cells = deserialize_datadoc_content(content) + return datadoc + + +def extract_front_matter(markdown_str: str): + front_matter_pattern = re.compile(r"^---\n(.*?)\n---\n\n", re.DOTALL) + match = front_matter_pattern.match(markdown_str) + if match: + front_matter_str = match.group(1) + content = markdown_str[match.end() :] + front_matter = yaml.safe_load(front_matter_str) + else: + raise ValueError("Invalid Markdown format: Missing front matter.") + return front_matter, content + + +def create_datadoc_from_metadata(metadata: dict) -> DataDoc: + datadoc = DataDoc( + id=metadata.get("id"), + environment_id=metadata.get("environment_id"), + public=metadata.get("public", True), + archived=metadata.get("archived", False), + owner_uid=metadata.get("owner_uid"), + created_at=metadata.get("created_at"), + updated_at=metadata.get("updated_at"), + title=metadata.get("title", ""), + ) + datadoc.meta = metadata.get("meta", {}) + return datadoc + + +def deserialize_datadoc_content(content_str: str) -> List[DataCell]: + cells = [] + # Pattern to match cell metadata in HTML comments and the following content + pattern = re.compile(r"\n(.*?)(?=(\n\n\n" cell_content = serialize_cell_content(cell) @@ -50,13 +69,14 @@ def serialize_datacells(cells: List[DataCell]) -> str: def serialize_cell_content(cell: DataCell) -> str: + cell_meta = cell.meta or {} if cell.cell_type == DataCellType.query: - query_title = cell.meta.get("title", "Query") + query_title = cell_meta.get("title", "Query") return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n" elif cell.cell_type == DataCellType.text: - return f"{cell.context.strip()}\n" + return f"## Text\n\n```text\n{cell.context.strip()}\n```\n" elif cell.cell_type == DataCellType.chart: - return "## Chart\n\n*Chart generated from the metadata.*\n" + return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n" def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc: @@ -72,7 +92,10 @@ def extract_front_matter(markdown_str: str): if match: front_matter_str = match.group(1) content = markdown_str[match.end() :] - front_matter = yaml.safe_load(front_matter_str) + try: + front_matter = yaml.safe_load(front_matter_str) + except yaml.YAMLError as e: + raise ValueError(f"Error parsing front matter YAML: {e}") else: raise ValueError("Invalid Markdown format: Missing front matter.") return front_matter, content @@ -85,8 +108,8 @@ def create_datadoc_from_metadata(metadata: dict) -> DataDoc: public=metadata.get("public", True), archived=metadata.get("archived", False), owner_uid=metadata.get("owner_uid"), - created_at=metadata.get("created_at"), - updated_at=metadata.get("updated_at"), + created_at=parse_datetime_as_utc(metadata.get("created_at")), + updated_at=parse_datetime_as_utc(metadata.get("updated_at")), title=metadata.get("title", ""), ) datadoc.meta = metadata.get("meta", {}) @@ -96,23 +119,27 @@ def create_datadoc_from_metadata(metadata: dict) -> DataDoc: def deserialize_datadoc_content(content_str: str) -> List[DataCell]: cells = [] # Pattern to match cell metadata in HTML comments and the following content - pattern = re.compile(r"\n(.*?)(?=(\n\n\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL) matches = pattern.finditer(content_str) for match in matches: metadata_str = match.group(1) cell_content = match.group(2) - metadata = yaml.safe_load(metadata_str) - cell_type_str = metadata.get("cell_type", "markdown").lower() + try: + metadata = yaml.safe_load(metadata_str) + except yaml.YAMLError as e: + raise ValueError(f"Error parsing cell metadata YAML: {e}") + + cell_type_str = metadata.get("cell_type", "query").lower() cell_type = DataCellType[cell_type_str] cell = DataCell( id=metadata.get("id"), cell_type=cell_type, - context=cell_content.strip(), - created_at=metadata.get("created_at"), - updated_at=metadata.get("updated_at"), + context=( + cell_content.strip() if cell_type != DataCellType.chart else None + ), # Charts are generated from the metadata, and not from content + created_at=parse_datetime_as_utc(metadata.get("created_at")), + updated_at=parse_datetime_as_utc(metadata.get("updated_at")), meta=metadata.get("meta", {}), ) cells.append(cell) - return cells diff --git a/querybook/tests/test_lib/test_github_integration/test_serializers.py b/querybook/tests/test_lib/test_github_integration/test_serializers.py new file mode 100644 index 000000000..e708a25dd --- /dev/null +++ b/querybook/tests/test_lib/test_github_integration/test_serializers.py @@ -0,0 +1,112 @@ +import pytest +from const.data_doc import DataCellType +from lib.github.serializers import ( + serialize_datadoc_to_markdown, + deserialize_datadoc_from_markdown, +) +from models.datadoc import DataCell, DataDoc +from datetime import datetime, timezone + + +@pytest.fixture +def mock_datadoc(): + cells = [ + DataCell( + id=1, + cell_type=DataCellType.query, + context="SELECT * FROM table;", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + DataCell( + id=2, + cell_type=DataCellType.text, + context="This is a text cell.", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + DataCell( + id=3, + cell_type=DataCellType.chart, + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + ] + datadoc = DataDoc( + id=1, + environment_id=1, + public=True, + archived=False, + owner_uid="user1", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + title="Test DataDoc", + cells=cells, + ) + datadoc.meta = {} + return datadoc + + +def test_serialize_datadoc_to_markdown(mock_datadoc): + expected_markdown = ( + "---\n" + "archived: false\n" + "created_at: '2023-01-01T00:00:00+00:00'\n" + "environment_id: 1\n" + "id: 1\n" + "meta:\n" + " variables: []\n" + "owner_uid: user1\n" + "public: true\n" + "title: Test DataDoc\n" + "updated_at: '2023-01-01T00:00:00+00:00'\n" + "---\n\n" + "# Test DataDoc\n\n" + "\n" + "## Query: Query\n\n" + "```sql\nSELECT * FROM table;\n```\n\n" + "\n" + "## Text\n\n" + "```text\nThis is a text cell.\n```\n\n" + "\n" + "## Chart\n\n" + "```text\n*Chart generated from the metadata.*\n```\n\n" + ) + + serialized = serialize_datadoc_to_markdown(mock_datadoc) + + # Remove any extra newlines for comparison + serialized = "\n".join([line for line in serialized.splitlines() if line.strip()]) + expected_markdown = "\n".join( + [line for line in expected_markdown.splitlines() if line.strip()] + ) + assert serialized == expected_markdown + + +def test_deserialize_datadoc_from_markdown(mock_datadoc): + markdown_str = serialize_datadoc_to_markdown(mock_datadoc) + deserialized = deserialize_datadoc_from_markdown(markdown_str) + assert deserialized.to_dict(with_cells=True) == mock_datadoc.to_dict( + with_cells=True + )