From 9932e31cd897ffecb77551bc9a7040b097066920 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 19:12:05 +0000 Subject: [PATCH 1/5] link datadoc to github directory --- querybook/server/datasources/github.py | 12 ++++++++ querybook/server/logic/github.py | 40 ++++++++++++++++++++++++++ querybook/server/models/__init__.py | 1 + querybook/server/models/github.py | 37 ++++++++++++++++++++++++ 4 files changed, 90 insertions(+) create mode 100644 querybook/server/logic/github.py create mode 100644 querybook/server/models/github.py diff --git a/querybook/server/datasources/github.py b/querybook/server/datasources/github.py index 08601cdf5..50d9af68f 100644 --- a/querybook/server/datasources/github.py +++ b/querybook/server/datasources/github.py @@ -1,6 +1,8 @@ from app.datasource import register from lib.github.github import github_manager from typing import Dict +from logic import github as logic +from flask_login import current_user @register("/github/auth/", methods=["GET"]) @@ -12,3 +14,13 @@ def connect_github() -> Dict[str, str]: def is_github_authenticated() -> str: is_authenticated = github_manager.get_github_token() is not None return {"is_authenticated": is_authenticated} + + +@register("/github/datadocs//link/", methods=["POST"]) +def link_datadoc_to_github( + datadoc_id: int, + directory: str, +) -> Dict: + return logic.create_repo_link( + datadoc_id=datadoc_id, user_id=current_user.id, directory=directory + ) diff --git a/querybook/server/logic/github.py b/querybook/server/logic/github.py new file mode 100644 index 000000000..3415dcc2e --- /dev/null +++ b/querybook/server/logic/github.py @@ -0,0 +1,40 @@ +from app.db import with_session +from models.github import GitHubLink +from models.datadoc import DataDoc + + +@with_session +def create_repo_link( + datadoc_id: int, + user_id: int, + directory: str, + commit=True, + session=None, +): + datadoc = DataDoc.get(id=datadoc_id, session=session) + assert datadoc is not None, f"DataDoc with id {datadoc_id} not found" + + github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session) + assert ( + github_link is None + ), f"GitHub link for DataDoc with id {datadoc_id} already exists" + + github_link = GitHubLink.create( + { + "datadoc_id": datadoc_id, + "user_id": user_id, + "directory": directory, + }, + commit=commit, + session=session, + ) + return github_link + + +@with_session +def get_repo_link(datadoc_id: int, session=None): + github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session) + assert ( + github_link is not None + ), f"GitHub link for DataDoc with id {datadoc_id} not found" + return github_link diff --git a/querybook/server/models/__init__.py b/querybook/server/models/__init__.py index cf3dce9f2..6550df625 100644 --- a/querybook/server/models/__init__.py +++ b/querybook/server/models/__init__.py @@ -15,3 +15,4 @@ from .data_element import * from .comment import * from .survey import * +from .github import * diff --git a/querybook/server/models/github.py b/querybook/server/models/github.py new file mode 100644 index 000000000..e802a656c --- /dev/null +++ b/querybook/server/models/github.py @@ -0,0 +1,37 @@ +import sqlalchemy as sql +from sqlalchemy.sql import func +from lib.sqlalchemy import CRUDMixin +from sqlalchemy.orm import backref, relationship +from app import db + +Base = db.Base + + +class GitHubLink(Base, CRUDMixin): + __tablename__ = "github_link" + id = sql.Column(sql.Integer, primary_key=True, autoincrement=True) + datadoc_id = sql.Column( + sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True + ) + user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False) + directory = sql.Column(sql.String(255), nullable=False) + created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False) + updated_at = sql.Column( + sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False + ) + + datadoc = relationship( + "DataDoc", + backref=backref("github_link", uselist=False, cascade="all, delete-orphan"), + ) + user = relationship("User", backref=backref("github_link", uselist=False)) + + def to_dict(self): + return { + "id": self.id, + "datadoc_id": self.datadoc_id, + "user_id": self.user_id, + "directory": self.directory, + "created_at": self.created_at, + "updated_at": self.updated_at, + } From 18bd8b3c741c03dcfae7c4b4a4f35f835cd0bd8d Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 20:44:35 +0000 Subject: [PATCH 2/5] 3.35.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 8b9712a2d..2387207b2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "querybook", - "version": "3.34.2", + "version": "3.35.0", "description": "A Big Data Webapp", "private": true, "scripts": { From daa18d2098818028689450cd670053be4fc8fff2 Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 21:05:56 +0000 Subject: [PATCH 3/5] alembic migration --- .../aa328ae9dced_add_github_datadoc_link.py | 56 +++++++++++++++++++ querybook/server/models/github.py | 2 +- 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py diff --git a/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py b/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py new file mode 100644 index 000000000..522d9df9d --- /dev/null +++ b/querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py @@ -0,0 +1,56 @@ +"""Add GitHub Datadoc Link + +Revision ID: aa328ae9dced +Revises: f7b11b3e3a95 +Create Date: 2024-10-23 21:04:55.052696 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "aa328ae9dced" +down_revision = "f7b11b3e3a95" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "github_link", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("datadoc_id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.Integer(), nullable=False), + sa.Column( + "directory", + sa.String(length=255), + nullable=False, + server_default="datadocs", + ), + sa.Column( + "created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False + ), + sa.Column( + "updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False + ), + sa.ForeignKeyConstraint( + ["datadoc_id"], + ["data_doc.id"], + ), + sa.ForeignKeyConstraint( + ["user_id"], + ["user.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("datadoc_id"), + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("github_link") + # ### end Alembic commands ### diff --git a/querybook/server/models/github.py b/querybook/server/models/github.py index e802a656c..9405c3fcb 100644 --- a/querybook/server/models/github.py +++ b/querybook/server/models/github.py @@ -14,7 +14,7 @@ class GitHubLink(Base, CRUDMixin): sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True ) user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False) - directory = sql.Column(sql.String(255), nullable=False) + directory = sql.Column(sql.String(255), nullable=False, default="datadocs") created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False) updated_at = sql.Column( sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False From bcede5dd663179635bf9ab33646899cf3d29d7ea Mon Sep 17 00:00:00 2001 From: rongzhang Date: Wed, 23 Oct 2024 21:33:32 +0000 Subject: [PATCH 4/5] feat: Add Datadoc serializing util --- querybook/server/lib/github/serializers.py | 118 +++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 querybook/server/lib/github/serializers.py diff --git a/querybook/server/lib/github/serializers.py b/querybook/server/lib/github/serializers.py new file mode 100644 index 000000000..e6eb84924 --- /dev/null +++ b/querybook/server/lib/github/serializers.py @@ -0,0 +1,118 @@ +import yaml +import re +from typing import List +from models.datadoc import DataDoc, DataCell +from const.data_doc import DataCellType + + +def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str: + # Serialize DataDoc metadata to YAML front matter for readability + datadoc_metadata = { + "id": datadoc.id, + "environment_id": datadoc.environment_id, + "public": datadoc.public, + "archived": datadoc.archived, + "owner_uid": datadoc.owner_uid, + "created_at": datadoc.created_at.isoformat() if datadoc.created_at else None, + "updated_at": datadoc.updated_at.isoformat() if datadoc.updated_at else None, + "meta": datadoc.meta, + "title": datadoc.title, + } + front_matter = ( + f"---\n{yaml.dump(datadoc_metadata, default_flow_style=False)}---\n\n" + ) + + title = f"# {datadoc.title}\n\n" + content = serialize_datacells(datadoc.cells) + markdown_content = front_matter + title + content + return markdown_content + + +def serialize_datacells(cells: List[DataCell]) -> str: + lines = [] + for cell in cells: + # Since GitHub's Markdown renderer does not recognize multiple --- blocks as separate YAML sections, + # we serialize cell metadata in HTML comment to hide it from rendered view + cell_metadata = { + "id": cell.id, + "cell_type": cell.cell_type.name.lower(), + "created_at": cell.created_at.isoformat() if cell.created_at else None, + "updated_at": cell.updated_at.isoformat() if cell.updated_at else None, + "meta": cell.meta, + } + cell_metadata_yaml = yaml.dump(cell_metadata, default_flow_style=False) + cell_metadata_comment = f"\n" + + cell_content = serialize_cell_content(cell) + lines.append(cell_metadata_comment + cell_content) + + return "\n\n".join(lines) + + +def serialize_cell_content(cell: DataCell) -> str: + if cell.cell_type == DataCellType.query: + query_title = cell.meta.get("title", "Query") + return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n" + elif cell.cell_type == DataCellType.text: + return f"{cell.context.strip()}\n" + elif cell.cell_type == DataCellType.chart: + return "## Chart\n\n*Chart generated from the metadata.*\n" + + +def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc: + front_matter, content = extract_front_matter(markdown_str) + datadoc = create_datadoc_from_metadata(front_matter) + datadoc.cells = deserialize_datadoc_content(content) + return datadoc + + +def extract_front_matter(markdown_str: str): + front_matter_pattern = re.compile(r"^---\n(.*?)\n---\n\n", re.DOTALL) + match = front_matter_pattern.match(markdown_str) + if match: + front_matter_str = match.group(1) + content = markdown_str[match.end() :] + front_matter = yaml.safe_load(front_matter_str) + else: + raise ValueError("Invalid Markdown format: Missing front matter.") + return front_matter, content + + +def create_datadoc_from_metadata(metadata: dict) -> DataDoc: + datadoc = DataDoc( + id=metadata.get("id"), + environment_id=metadata.get("environment_id"), + public=metadata.get("public", True), + archived=metadata.get("archived", False), + owner_uid=metadata.get("owner_uid"), + created_at=metadata.get("created_at"), + updated_at=metadata.get("updated_at"), + title=metadata.get("title", ""), + ) + datadoc.meta = metadata.get("meta", {}) + return datadoc + + +def deserialize_datadoc_content(content_str: str) -> List[DataCell]: + cells = [] + # Pattern to match cell metadata in HTML comments and the following content + pattern = re.compile(r"\n(.*?)(?=(\n\n\n" cell_content = serialize_cell_content(cell) @@ -50,13 +69,14 @@ def serialize_datacells(cells: List[DataCell]) -> str: def serialize_cell_content(cell: DataCell) -> str: + cell_meta = cell.meta or {} if cell.cell_type == DataCellType.query: - query_title = cell.meta.get("title", "Query") + query_title = cell_meta.get("title", "Query") return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n" elif cell.cell_type == DataCellType.text: - return f"{cell.context.strip()}\n" + return f"## Text\n\n```text\n{cell.context.strip()}\n```\n" elif cell.cell_type == DataCellType.chart: - return "## Chart\n\n*Chart generated from the metadata.*\n" + return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n" def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc: @@ -72,7 +92,10 @@ def extract_front_matter(markdown_str: str): if match: front_matter_str = match.group(1) content = markdown_str[match.end() :] - front_matter = yaml.safe_load(front_matter_str) + try: + front_matter = yaml.safe_load(front_matter_str) + except yaml.YAMLError as e: + raise ValueError(f"Error parsing front matter YAML: {e}") else: raise ValueError("Invalid Markdown format: Missing front matter.") return front_matter, content @@ -85,8 +108,8 @@ def create_datadoc_from_metadata(metadata: dict) -> DataDoc: public=metadata.get("public", True), archived=metadata.get("archived", False), owner_uid=metadata.get("owner_uid"), - created_at=metadata.get("created_at"), - updated_at=metadata.get("updated_at"), + created_at=parse_datetime_as_utc(metadata.get("created_at")), + updated_at=parse_datetime_as_utc(metadata.get("updated_at")), title=metadata.get("title", ""), ) datadoc.meta = metadata.get("meta", {}) @@ -96,23 +119,27 @@ def create_datadoc_from_metadata(metadata: dict) -> DataDoc: def deserialize_datadoc_content(content_str: str) -> List[DataCell]: cells = [] # Pattern to match cell metadata in HTML comments and the following content - pattern = re.compile(r"\n(.*?)(?=(\n\n\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL) matches = pattern.finditer(content_str) for match in matches: metadata_str = match.group(1) cell_content = match.group(2) - metadata = yaml.safe_load(metadata_str) - cell_type_str = metadata.get("cell_type", "markdown").lower() + try: + metadata = yaml.safe_load(metadata_str) + except yaml.YAMLError as e: + raise ValueError(f"Error parsing cell metadata YAML: {e}") + + cell_type_str = metadata.get("cell_type", "query").lower() cell_type = DataCellType[cell_type_str] cell = DataCell( id=metadata.get("id"), cell_type=cell_type, - context=cell_content.strip(), - created_at=metadata.get("created_at"), - updated_at=metadata.get("updated_at"), + context=( + cell_content.strip() if cell_type != DataCellType.chart else None + ), # Charts are generated from the metadata, and not from content + created_at=parse_datetime_as_utc(metadata.get("created_at")), + updated_at=parse_datetime_as_utc(metadata.get("updated_at")), meta=metadata.get("meta", {}), ) cells.append(cell) - return cells diff --git a/querybook/tests/test_lib/test_github_integration/test_serializers.py b/querybook/tests/test_lib/test_github_integration/test_serializers.py new file mode 100644 index 000000000..e708a25dd --- /dev/null +++ b/querybook/tests/test_lib/test_github_integration/test_serializers.py @@ -0,0 +1,112 @@ +import pytest +from const.data_doc import DataCellType +from lib.github.serializers import ( + serialize_datadoc_to_markdown, + deserialize_datadoc_from_markdown, +) +from models.datadoc import DataCell, DataDoc +from datetime import datetime, timezone + + +@pytest.fixture +def mock_datadoc(): + cells = [ + DataCell( + id=1, + cell_type=DataCellType.query, + context="SELECT * FROM table;", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + DataCell( + id=2, + cell_type=DataCellType.text, + context="This is a text cell.", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + DataCell( + id=3, + cell_type=DataCellType.chart, + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + meta={}, + ), + ] + datadoc = DataDoc( + id=1, + environment_id=1, + public=True, + archived=False, + owner_uid="user1", + created_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + updated_at=datetime(2023, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + title="Test DataDoc", + cells=cells, + ) + datadoc.meta = {} + return datadoc + + +def test_serialize_datadoc_to_markdown(mock_datadoc): + expected_markdown = ( + "---\n" + "archived: false\n" + "created_at: '2023-01-01T00:00:00+00:00'\n" + "environment_id: 1\n" + "id: 1\n" + "meta:\n" + " variables: []\n" + "owner_uid: user1\n" + "public: true\n" + "title: Test DataDoc\n" + "updated_at: '2023-01-01T00:00:00+00:00'\n" + "---\n\n" + "# Test DataDoc\n\n" + "\n" + "## Query: Query\n\n" + "```sql\nSELECT * FROM table;\n```\n\n" + "\n" + "## Text\n\n" + "```text\nThis is a text cell.\n```\n\n" + "\n" + "## Chart\n\n" + "```text\n*Chart generated from the metadata.*\n```\n\n" + ) + + serialized = serialize_datadoc_to_markdown(mock_datadoc) + + # Remove any extra newlines for comparison + serialized = "\n".join([line for line in serialized.splitlines() if line.strip()]) + expected_markdown = "\n".join( + [line for line in expected_markdown.splitlines() if line.strip()] + ) + assert serialized == expected_markdown + + +def test_deserialize_datadoc_from_markdown(mock_datadoc): + markdown_str = serialize_datadoc_to_markdown(mock_datadoc) + deserialized = deserialize_datadoc_from_markdown(markdown_str) + assert deserialized.to_dict(with_cells=True) == mock_datadoc.to_dict( + with_cells=True + )