Skip to content

Commit

Permalink
Merge pull request #7 from zhangvi7/github/datadoc-link
Browse files Browse the repository at this point in the history
feat: Link Datadoc to GitHub Directory
  • Loading branch information
zhangvi7 authored Oct 30, 2024
2 parents 27dbd01 + 398cb62 commit 866e733
Show file tree
Hide file tree
Showing 8 changed files with 404 additions and 1 deletion.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "querybook",
"version": "3.34.2",
"version": "3.35.0",
"description": "A Big Data Webapp",
"private": true,
"scripts": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Add GitHub Datadoc Link
Revision ID: aa328ae9dced
Revises: f7b11b3e3a95
Create Date: 2024-10-23 21:04:55.052696
"""

from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the migration it revises.
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
# This migration declares no branch labels and no dependencies on other revisions.
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``github_link`` table.

    The table holds one row per DataDoc (``datadoc_id`` is unique) and records
    the linking user, the target directory (database default ``"datadocs"``)
    and creation/update timestamps that default to ``now()`` on the server.
    """

    def timestamp_column(name):
        # Non-nullable DateTime column filled in by the database via now().
        return sa.Column(
            name, sa.DateTime(), server_default=sa.text("now()"), nullable=False
        )

    columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column(
            "directory",
            sa.String(length=255),
            server_default="datadocs",
            nullable=False,
        ),
        timestamp_column("created_at"),
        timestamp_column("updated_at"),
    ]
    constraints = [
        sa.ForeignKeyConstraint(["datadoc_id"], ["data_doc.id"]),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"]),
        sa.PrimaryKeyConstraint("id"),
        # At most one GitHub link per DataDoc.
        sa.UniqueConstraint("datadoc_id"),
    ]
    op.create_table("github_link", *columns, *constraints)


def downgrade():
    """Drop the ``github_link`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("github_link")
    # ### end Alembic commands ###
12 changes: 12 additions & 0 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from app.datasource import register
from lib.github.github import github_manager
from typing import Dict
from logic import github as logic
from flask_login import current_user


@register("/github/auth/", methods=["GET"])
Expand All @@ -12,3 +14,13 @@ def connect_github() -> Dict[str, str]:
def is_github_authenticated() -> Dict[str, bool]:
    """Report whether a GitHub token is currently available.

    Returns:
        A dict with the single key ``"is_authenticated"``, which is ``True``
        when ``github_manager.get_github_token()`` returns something other
        than ``None``.
    """
    # The payload is a dict holding a bool, not a string, hence the
    # corrected return annotation.
    is_authenticated = github_manager.get_github_token() is not None
    return {"is_authenticated": is_authenticated}


@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
    datadoc_id: int,
    directory: str,
) -> Dict:
    """Link a DataDoc to a GitHub directory on behalf of the current user.

    Args:
        datadoc_id: ID of the DataDoc, taken from the URL path.
        directory: Directory the DataDoc should be linked to.

    Returns:
        Whatever ``logic.create_repo_link`` returns for the new link.
    """
    # NOTE(review): no permission check on the DataDoc is visible in this
    # handler — confirm whether access is enforced elsewhere.
    link_params = {
        "datadoc_id": datadoc_id,
        "user_id": current_user.id,
        "directory": directory,
    }
    return logic.create_repo_link(**link_params)
145 changes: 145 additions & 0 deletions querybook/server/lib/github/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import yaml
import re
from typing import List
from models.datadoc import DataDoc, DataCell
from const.data_doc import DataCellType
from datetime import datetime, timezone


def parse_datetime_as_utc(date_str: str) -> datetime:
    """Parse the given value into a timezone-aware datetime in UTC.

    Args:
        date_str: An ISO-8601 string, an existing ``datetime`` (YAML loaders
            may already have parsed the timestamp), or a falsy value.

    Returns:
        An aware ``datetime`` whose offset is UTC. A falsy input yields the
        current UTC time.

    Raises:
        ValueError: If a non-empty string is not valid ISO-8601.
    """
    if isinstance(date_str, datetime):
        parsed = date_str
    elif date_str:
        parsed = datetime.fromisoformat(date_str)
    else:
        # datetime.now(timezone.utc) is already aware; no extra replace() needed.
        return datetime.now(timezone.utc)

    if parsed.utcoffset() is None:
        # Naive value: the serializers in this module write isoformat() without
        # an offset, so read it back as UTC. astimezone() would instead treat it
        # as server-local time and shift it on any non-UTC host.
        # NOTE(review): assumes stored timestamps are UTC — confirm.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """Render a DataDoc as a Markdown document.

    The output is a YAML front-matter block with the DataDoc metadata, an H1
    heading with the title, and then the serialized cells.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML.
    """

    def iso_or_none(timestamp):
        return timestamp.isoformat() if timestamp else None

    # DataDoc-level metadata goes into the front matter for readability.
    metadata = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "public": datadoc.public,
        "archived": datadoc.archived,
        "owner_uid": datadoc.owner_uid,
        "created_at": iso_or_none(datadoc.created_at),
        "updated_at": iso_or_none(datadoc.updated_at),
        "meta": datadoc.meta,
        "title": datadoc.title,
    }
    try:
        metadata_yaml = yaml.dump(metadata, default_flow_style=False)
    except yaml.YAMLError as e:
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}")

    sections = [
        f"---\n{metadata_yaml}---\n\n",
        f"# {datadoc.title}\n\n",
        serialize_datacells(datadoc.cells),
    ]
    return "".join(sections)


def serialize_datacells(cells: List[DataCell]) -> str:
    """Serialize cells to Markdown, one section per cell.

    Each section is an HTML comment holding the cell metadata as YAML,
    followed by the cell's rendered content. Sections are separated by a
    blank line.

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML.
    """

    def iso_or_none(timestamp):
        return timestamp.isoformat() if timestamp else None

    sections = []
    for cell in cells:
        # GitHub's Markdown renderer does not treat multiple --- blocks as
        # separate YAML sections, so per-cell metadata goes in an HTML comment
        # that stays hidden in the rendered view.
        header = {
            "id": cell.id,
            "cell_type": cell.cell_type.name.lower(),
            "created_at": iso_or_none(cell.created_at),
            "updated_at": iso_or_none(cell.updated_at),
            "meta": cell.meta,
        }
        try:
            header_yaml = yaml.dump(header, default_flow_style=False)
        except yaml.YAMLError as e:
            raise ValueError(f"Error serializing cell metadata to YAML: {e}")

        header_comment = "<!--\n" + header_yaml.strip() + "\n-->\n"
        sections.append(header_comment + serialize_cell_content(cell))

    return "\n\n".join(sections)


def serialize_cell_content(cell: DataCell) -> str:
    """Render one cell's body as a Markdown heading plus a fenced block.

    Query cells use a ``sql`` fence and include the title from the cell meta
    (default ``"Query"``). Text cells use a ``text`` fence. Chart cells emit a
    fixed placeholder.
    """
    kind = cell.cell_type

    if kind == DataCellType.chart:
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"

    if kind == DataCellType.text:
        return f"## Text\n\n```text\n{cell.context.strip()}\n```\n"

    if kind == DataCellType.query:
        meta = cell.meta or {}
        query_title = meta.get("title", "Query")
        return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n"

    # NOTE(review): any other cell type yields None, as before — confirm that
    # callers never pass one.
    return None


def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """Rebuild a DataDoc, including its cells, from serialized Markdown.

    Raises:
        ValueError: If the front matter is missing or any YAML fails to parse.
    """
    metadata, body = extract_front_matter(markdown_str)
    restored = create_datadoc_from_metadata(metadata)
    restored.cells = deserialize_datadoc_content(body)
    return restored


def extract_front_matter(markdown_str: str):
    """Split a Markdown document into parsed front matter and remaining body.

    The document must start with a ``---`` delimited YAML block followed by a
    blank line.

    Returns:
        A ``(front_matter, content)`` tuple: the YAML-decoded front matter and
        the text after it.

    Raises:
        ValueError: If the front matter is missing or is not valid YAML.
    """
    match = re.match(r"^---\n(.*?)\n---\n\n", markdown_str, re.DOTALL)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    content = markdown_str[match.end() :]
    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing front matter YAML: {e}")
    return front_matter, content


def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """Build a DataDoc from a front-matter metadata dict.

    Missing keys fall back to: ``public=True``, ``archived=False``,
    ``title=""``, ``meta={}``; other fields default to ``None`` and
    timestamps go through ``parse_datetime_as_utc``.
    """
    fields = {
        "id": metadata.get("id"),
        "environment_id": metadata.get("environment_id"),
        "public": metadata.get("public", True),
        "archived": metadata.get("archived", False),
        "owner_uid": metadata.get("owner_uid"),
        "created_at": parse_datetime_as_utc(metadata.get("created_at")),
        "updated_at": parse_datetime_as_utc(metadata.get("updated_at")),
        "title": metadata.get("title", ""),
    }
    restored = DataDoc(**fields)
    # meta is assigned after construction rather than passed to the constructor.
    restored.meta = metadata.get("meta", {})
    return restored


def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """Parse the cell sections of a serialized DataDoc into DataCell objects.

    Each cell is an HTML comment with YAML metadata, a ``##`` heading, and a
    fenced block with the cell content. Text that does not match this layout
    is ignored.

    Raises:
        ValueError: If a cell's metadata is not valid YAML.
    """
    # Group 1: YAML metadata inside the HTML comment.
    # Group 2: body of the fenced block that follows the heading.
    cell_pattern = re.compile(
        r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL
    )

    cells = []
    for metadata_str, cell_content in cell_pattern.findall(content_str):
        try:
            metadata = yaml.safe_load(metadata_str)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}")

        cell_type = DataCellType[metadata.get("cell_type", "query").lower()]

        # Charts are generated from the metadata, and not from content.
        if cell_type == DataCellType.chart:
            context = None
        else:
            context = cell_content.strip()

        cells.append(
            DataCell(
                id=metadata.get("id"),
                cell_type=cell_type,
                context=context,
                created_at=parse_datetime_as_utc(metadata.get("created_at")),
                updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
                meta=metadata.get("meta", {}),
            )
        )
    return cells
40 changes: 40 additions & 0 deletions querybook/server/logic/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from app.db import with_session
from models.github import GitHubLink
from models.datadoc import DataDoc


@with_session
def create_repo_link(
    datadoc_id: int,
    user_id: int,
    directory: str,
    commit=True,
    session=None,
):
    """Create the GitHub link for a DataDoc.

    Args:
        datadoc_id: ID of the DataDoc to link; it must exist.
        user_id: ID of the user creating the link.
        directory: Directory to associate with the DataDoc.
        commit: Forwarded to ``GitHubLink.create``.
        session: Database session forwarded to the model calls.

    Returns:
        The newly created ``GitHubLink``.

    Raises:
        AssertionError: If the DataDoc does not exist or is already linked.
    """
    datadoc = DataDoc.get(id=datadoc_id, session=session)
    assert datadoc is not None, f"DataDoc with id {datadoc_id} not found"

    existing_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    assert (
        existing_link is None
    ), f"GitHub link for DataDoc with id {datadoc_id} already exists"

    link_fields = {
        "datadoc_id": datadoc_id,
        "user_id": user_id,
        "directory": directory,
    }
    return GitHubLink.create(link_fields, commit=commit, session=session)


@with_session
def get_repo_link(datadoc_id: int, session=None):
    """Return the GitHub link for a DataDoc.

    Raises:
        AssertionError: If the DataDoc has no GitHub link.
    """
    link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    assert (
        link is not None
    ), f"GitHub link for DataDoc with id {datadoc_id} not found"
    return link
1 change: 1 addition & 0 deletions querybook/server/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
from .data_element import *
from .comment import *
from .survey import *
from .github import *
37 changes: 37 additions & 0 deletions querybook/server/models/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sqlalchemy as sql
from sqlalchemy.sql import func
from lib.sqlalchemy import CRUDMixin
from sqlalchemy.orm import backref, relationship
from app import db

# Local alias for the base class exposed by app.db; the model below inherits from it.
Base = db.Base


class GitHubLink(Base, CRUDMixin):
    """Link between a DataDoc and a GitHub directory.

    Each DataDoc can have at most one link (``datadoc_id`` is unique).
    """

    __tablename__ = "github_link"

    id = sql.Column(sql.Integer, primary_key=True, autoincrement=True)
    # One link per DataDoc, enforced by the unique constraint.
    datadoc_id = sql.Column(
        sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True
    )
    # User who created the link.
    user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False)
    directory = sql.Column(sql.String(255), nullable=False, default="datadocs")
    created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False)
    # Refreshed to now() on every update.
    updated_at = sql.Column(
        sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Deleting a DataDoc also deletes its link (delete-orphan cascade on the backref).
    datadoc = relationship(
        "DataDoc",
        backref=backref("github_link", uselist=False, cascade="all, delete-orphan"),
    )
    user = relationship("User", backref=backref("github_link", uselist=False))

    def to_dict(self):
        """Return the link's column values as a plain dict."""
        exported_fields = (
            "id",
            "datadoc_id",
            "user_id",
            "directory",
            "created_at",
            "updated_at",
        )
        return {name: getattr(self, name) for name in exported_fields}
Loading

0 comments on commit 866e733

Please sign in to comment.