Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Link Datadoc to GitHub Directory #7

Merged
merged 6 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "querybook",
"version": "3.34.2",
"version": "3.35.0",
"description": "A Big Data Webapp",
"private": true,
"scripts": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Add GitHub Datadoc Link
Revision ID: aa328ae9dced
Revises: f7b11b3e3a95
Create Date: 2024-10-23 21:04:55.052696
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"github_link",
sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
sa.Column("datadoc_id", sa.Integer(), nullable=False),
sa.Column("user_id", sa.Integer(), nullable=False),
sa.Column(
"directory",
sa.String(length=255),
nullable=False,
server_default="datadocs",
),
sa.Column(
"created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.Column(
"updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
),
sa.ForeignKeyConstraint(
["datadoc_id"],
["data_doc.id"],
),
sa.ForeignKeyConstraint(
["user_id"],
["user.id"],
),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("datadoc_id"),
)
# ### end Alembic commands ###


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("github_link")
# ### end Alembic commands ###
12 changes: 12 additions & 0 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from app.datasource import register
from lib.github.github import github_manager
from typing import Dict
from logic import github as logic
from flask_login import current_user


@register("/github/auth/", methods=["GET"])
Expand All @@ -12,3 +14,13 @@ def connect_github() -> Dict[str, str]:
def is_github_authenticated() -> str:
is_authenticated = github_manager.get_github_token() is not None
return {"is_authenticated": is_authenticated}


@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
datadoc_id: int,
directory: str,
) -> Dict:
return logic.create_repo_link(
datadoc_id=datadoc_id, user_id=current_user.id, directory=directory
)
145 changes: 145 additions & 0 deletions querybook/server/lib/github/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import yaml
import re
from typing import List
from models.datadoc import DataDoc, DataCell
from const.data_doc import DataCellType
from datetime import datetime, timezone


def parse_datetime_as_utc(date_str: str) -> datetime:
"""
Parse the given date string to a datetime object in UTC.
"""
if isinstance(date_str, datetime):
return date_str.astimezone(timezone.utc)
if date_str:
return datetime.fromisoformat(date_str).astimezone(timezone.utc)
return datetime.now(timezone.utc).replace(tzinfo=timezone.utc)


def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
# Serialize DataDoc metadata to YAML front matter for readability
datadoc_metadata = {
"id": datadoc.id,
"environment_id": datadoc.environment_id,
"public": datadoc.public,
"archived": datadoc.archived,
"owner_uid": datadoc.owner_uid,
"created_at": datadoc.created_at.isoformat() if datadoc.created_at else None,
"updated_at": datadoc.updated_at.isoformat() if datadoc.updated_at else None,
"meta": datadoc.meta,
"title": datadoc.title,
}
try:
front_matter = (
f"---\n{yaml.dump(datadoc_metadata, default_flow_style=False)}---\n\n"
)
except yaml.YAMLError as e:
raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}")

title = f"# {datadoc.title}\n\n"
content = serialize_datacells(datadoc.cells)
markdown_content = front_matter + title + content
return markdown_content


def serialize_datacells(cells: List[DataCell]) -> str:
lines = []
for cell in cells:
# Since GitHub's Markdown renderer does not recognize multiple --- blocks as separate YAML sections,
# we serialize cell metadata in HTML comment to hide it from rendered view
cell_metadata = {
"id": cell.id,
"cell_type": cell.cell_type.name.lower(),
"created_at": cell.created_at.isoformat() if cell.created_at else None,
"updated_at": cell.updated_at.isoformat() if cell.updated_at else None,
"meta": cell.meta,
}
try:
cell_metadata_yaml = yaml.dump(cell_metadata, default_flow_style=False)
except yaml.YAMLError as e:
raise ValueError(f"Error serializing cell metadata to YAML: {e}")

cell_metadata_comment = f"<!--\n{cell_metadata_yaml.strip()}\n-->\n"

cell_content = serialize_cell_content(cell)
lines.append(cell_metadata_comment + cell_content)

return "\n\n".join(lines)


def serialize_cell_content(cell: DataCell) -> str:
cell_meta = cell.meta or {}
if cell.cell_type == DataCellType.query:
query_title = cell_meta.get("title", "Query")
return f"## Query: {query_title}\n\n```sql\n{cell.context.strip()}\n```\n"
elif cell.cell_type == DataCellType.text:
return f"## Text\n\n```text\n{cell.context.strip()}\n```\n"
elif cell.cell_type == DataCellType.chart:
return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"


def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
front_matter, content = extract_front_matter(markdown_str)
datadoc = create_datadoc_from_metadata(front_matter)
datadoc.cells = deserialize_datadoc_content(content)
return datadoc


def extract_front_matter(markdown_str: str):
front_matter_pattern = re.compile(r"^---\n(.*?)\n---\n\n", re.DOTALL)
match = front_matter_pattern.match(markdown_str)
if match:
front_matter_str = match.group(1)
content = markdown_str[match.end() :]
try:
front_matter = yaml.safe_load(front_matter_str)
except yaml.YAMLError as e:
raise ValueError(f"Error parsing front matter YAML: {e}")
else:
raise ValueError("Invalid Markdown format: Missing front matter.")
return front_matter, content


def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
datadoc = DataDoc(
id=metadata.get("id"),
environment_id=metadata.get("environment_id"),
public=metadata.get("public", True),
archived=metadata.get("archived", False),
owner_uid=metadata.get("owner_uid"),
created_at=parse_datetime_as_utc(metadata.get("created_at")),
updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
title=metadata.get("title", ""),
)
datadoc.meta = metadata.get("meta", {})
return datadoc


def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
cells = []
# Pattern to match cell metadata in HTML comments and the following content
pattern = re.compile(r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL)
matches = pattern.finditer(content_str)
for match in matches:
metadata_str = match.group(1)
cell_content = match.group(2)
try:
metadata = yaml.safe_load(metadata_str)
except yaml.YAMLError as e:
raise ValueError(f"Error parsing cell metadata YAML: {e}")

cell_type_str = metadata.get("cell_type", "query").lower()
cell_type = DataCellType[cell_type_str]
cell = DataCell(
id=metadata.get("id"),
cell_type=cell_type,
context=(
cell_content.strip() if cell_type != DataCellType.chart else None
), # Charts are generated from the metadata, and not from content
created_at=parse_datetime_as_utc(metadata.get("created_at")),
updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
meta=metadata.get("meta", {}),
)
cells.append(cell)
return cells
40 changes: 40 additions & 0 deletions querybook/server/logic/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from app.db import with_session
from models.github import GitHubLink
from models.datadoc import DataDoc


@with_session
def create_repo_link(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

realized that, do we allow user to unlink a datadoc, or change the directory of the doc?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, plan to add support for it later on, with maybe a warning saying it will change version history

datadoc_id: int,
user_id: int,
directory: str,
commit=True,
session=None,
):
datadoc = DataDoc.get(id=datadoc_id, session=session)
assert datadoc is not None, f"DataDoc with id {datadoc_id} not found"

github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
assert (
github_link is None
), f"GitHub link for DataDoc with id {datadoc_id} already exists"

github_link = GitHubLink.create(
{
"datadoc_id": datadoc_id,
"user_id": user_id,
"directory": directory,
},
commit=commit,
session=session,
)
return github_link


@with_session
def get_repo_link(datadoc_id: int, session=None):
github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
assert (
github_link is not None
), f"GitHub link for DataDoc with id {datadoc_id} not found"
return github_link
1 change: 1 addition & 0 deletions querybook/server/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
from .data_element import *
from .comment import *
from .survey import *
from .github import *
37 changes: 37 additions & 0 deletions querybook/server/models/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sqlalchemy as sql
from sqlalchemy.sql import func
from lib.sqlalchemy import CRUDMixin
from sqlalchemy.orm import backref, relationship
from app import db

Base = db.Base


class GitHubLink(Base, CRUDMixin):
__tablename__ = "github_link"
id = sql.Column(sql.Integer, primary_key=True, autoincrement=True)
datadoc_id = sql.Column(
sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True
)
user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this user_id for ? is it the doc owner or the first user commits the datadoc?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its the user who links the current GitHubs directory to the datadoc

directory = sql.Column(sql.String(255), nullable=False, default="datadocs")
created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False)
updated_at = sql.Column(
sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
)

datadoc = relationship(
"DataDoc",
backref=backref("github_link", uselist=False, cascade="all, delete-orphan"),
)
user = relationship("User", backref=backref("github_link", uselist=False))

def to_dict(self):
return {
"id": self.id,
"datadoc_id": self.datadoc_id,
"user_id": self.user_id,
"directory": self.directory,
"created_at": self.created_at,
"updated_at": self.updated_at,
}
Loading