-
Notifications
You must be signed in to change notification settings - Fork 240
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from zhangvi7/github/datadoc-link
feat: Link Datadoc to GitHub Directory
- Loading branch information
Showing
8 changed files
with
404 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
querybook/migrations/versions/aa328ae9dced_add_github_datadoc_link.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Add GitHub Datadoc Link | ||
Revision ID: aa328ae9dced | ||
Revises: f7b11b3e3a95 | ||
Create Date: 2024-10-23 21:04:55.052696 | ||
""" | ||
|
||
from alembic import op | ||
import sqlalchemy as sa | ||
|
||
|
||
# Alembic revision identifiers: this revision and the one it is applied on top of.
revision = "aa328ae9dced"
down_revision = "f7b11b3e3a95"
# This revision declares no branch labels and no dependencies on other revisions.
branch_labels = None
depends_on = None
|
||
|
||
def upgrade():
    """Create the ``github_link`` table (one row per linked DataDoc)."""
    # Column definitions, in the order they appear in the table.
    columns = (
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column(
            "directory",
            sa.String(length=255),
            nullable=False,
            server_default="datadocs",
        ),
        sa.Column(
            "created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
        ),
        sa.Column(
            "updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
        ),
    )
    # Table-level constraints, declared after the columns exactly as before.
    constraints = (
        sa.ForeignKeyConstraint(["datadoc_id"], ["data_doc.id"]),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"]),
        sa.PrimaryKeyConstraint("id"),
        # The unique constraint allows at most one link row per DataDoc.
        sa.UniqueConstraint("datadoc_id"),
    )
    op.create_table("github_link", *columns, *constraints)
|
||
|
||
def downgrade():
    """Revert this revision by dropping the ``github_link`` table."""
    table_name = "github_link"
    op.drop_table(table_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import yaml | ||
import re | ||
from typing import List | ||
from models.datadoc import DataDoc, DataCell | ||
from const.data_doc import DataCellType | ||
from datetime import datetime, timezone | ||
|
||
|
||
def parse_datetime_as_utc(date_str: str) -> datetime:
    """
    Parse the given date string to a timezone-aware datetime object in UTC.

    Args:
        date_str: An ISO 8601 string accepted by ``datetime.fromisoformat``.
            A ``datetime`` instance is also accepted and normalized, and a
            falsy value (``None`` or ``""``) yields the current UTC time.

    Returns:
        A timezone-aware ``datetime`` whose ``tzinfo`` is UTC.

    Raises:
        ValueError: If ``date_str`` is a non-empty string that is not valid
            ISO 8601.
    """
    if isinstance(date_str, datetime):
        parsed = date_str
    elif date_str:
        parsed = datetime.fromisoformat(date_str)
    else:
        return datetime.now(timezone.utc)

    if parsed.tzinfo is None:
        # ``astimezone`` would interpret a naive value as host-local time, so
        # the result would depend on the server's timezone. Naive timestamps
        # are treated as already being in UTC instead.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
||
|
||
def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """
    Serialize a DataDoc into a Markdown document.

    The output consists of a YAML front matter block holding the DataDoc
    metadata, an H1 heading with the title, and the serialized cells.

    Args:
        datadoc: The DataDoc to serialize.

    Returns:
        The Markdown text.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML.
    """
    # Serialize DataDoc metadata to YAML front matter for readability
    datadoc_metadata = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "public": datadoc.public,
        "archived": datadoc.archived,
        "owner_uid": datadoc.owner_uid,
        "created_at": datadoc.created_at.isoformat() if datadoc.created_at else None,
        "updated_at": datadoc.updated_at.isoformat() if datadoc.updated_at else None,
        "meta": datadoc.meta,
        "title": datadoc.title,
    }
    try:
        metadata_yaml = yaml.dump(datadoc_metadata, default_flow_style=False)
    except yaml.YAMLError as e:
        # Chain explicitly so the YAML error is kept as the direct cause.
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}") from e

    front_matter = f"---\n{metadata_yaml}---\n\n"
    title = f"# {datadoc.title}\n\n"
    content = serialize_datacells(datadoc.cells)
    return front_matter + title + content
|
||
|
||
def serialize_datacells(cells: List[DataCell]) -> str:
    """
    Serialize DataCells into Markdown sections.

    Each cell becomes an HTML comment containing its metadata as YAML,
    followed by the cell content. Sections are separated by a blank line.

    Args:
        cells: The cells to serialize, in document order.

    Returns:
        The Markdown text for all cells ("" when there are no cells).

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML.
    """
    sections = []
    for cell in cells:
        # Since GitHub's Markdown renderer does not recognize multiple --- blocks as separate YAML sections,
        # we serialize cell metadata in HTML comment to hide it from rendered view
        cell_metadata = {
            "id": cell.id,
            "cell_type": cell.cell_type.name.lower(),
            "created_at": cell.created_at.isoformat() if cell.created_at else None,
            "updated_at": cell.updated_at.isoformat() if cell.updated_at else None,
            "meta": cell.meta,
        }
        try:
            cell_metadata_yaml = yaml.dump(cell_metadata, default_flow_style=False)
        except yaml.YAMLError as e:
            # Chain explicitly so the YAML error is kept as the direct cause.
            raise ValueError(f"Error serializing cell metadata to YAML: {e}") from e

        cell_metadata_comment = f"<!--\n{cell_metadata_yaml.strip()}\n-->\n"
        sections.append(cell_metadata_comment + serialize_cell_content(cell))

    return "\n\n".join(sections)
|
||
|
||
def serialize_cell_content(cell: DataCell) -> str:
    """
    Render the body of a single cell as a Markdown heading plus fenced block.

    Args:
        cell: The cell to render.

    Returns:
        The Markdown text for the cell. Chart cells render a fixed
        placeholder; their context is not written out.

    Raises:
        ValueError: If the cell type is not query, text or chart. Previously
            this case fell through and returned ``None``, which made the
            caller fail on string concatenation.
    """
    cell_type = cell.cell_type
    if cell_type == DataCellType.chart:
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"

    if cell_type == DataCellType.query:
        cell_meta = cell.meta or {}
        query_title = cell_meta.get("title", "Query")
        heading, fence_lang = f"## Query: {query_title}", "sql"
    elif cell_type == DataCellType.text:
        heading, fence_lang = "## Text", "text"
    else:
        raise ValueError(f"Unsupported cell type: {cell_type}")

    # A cell without context is rendered as an empty fenced block.
    body = (cell.context or "").strip()
    return f"{heading}\n\n```{fence_lang}\n{body}\n```\n"
|
||
|
||
def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """
    Rebuild a DataDoc from its Markdown representation.

    The YAML front matter supplies the DataDoc fields, and the remaining text
    is parsed into the DataDoc's cells.
    """
    metadata, body = extract_front_matter(markdown_str)
    restored_doc = create_datadoc_from_metadata(metadata)
    restored_doc.cells = deserialize_datadoc_content(body)
    return restored_doc
|
||
|
||
def extract_front_matter(markdown_str: str):
    """
    Split a Markdown document into its YAML front matter and remaining content.

    The document must start with ``---``, followed by the YAML block, a
    closing ``---`` line and one blank line.

    Args:
        markdown_str: The full Markdown text.

    Returns:
        A ``(front_matter, content)`` tuple: the parsed front matter mapping
        and the text that follows the front matter block.

    Raises:
        ValueError: If the front matter is missing, is not valid YAML, or
            does not parse to a mapping.
    """
    front_matter_pattern = re.compile(r"^---\n(.*?)\n---\n\n", re.DOTALL)
    match = front_matter_pattern.match(markdown_str)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing front matter YAML: {e}") from e

    # An empty block parses to None and a bare scalar to str/int; callers use
    # ``.get`` on the result, so reject anything that is not a mapping here.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Invalid Markdown format: Front matter must be a YAML mapping."
        )

    content = markdown_str[match.end() :]
    return front_matter, content
|
||
|
||
def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """
    Build a DataDoc from a parsed front matter mapping.

    Missing keys fall back to: ``public=True``, ``archived=False``,
    ``title=""`` and ``meta={}``; other missing keys are passed as ``None``.
    Timestamps are converted with ``parse_datetime_as_utc``.
    """
    constructor_fields = {
        "id": metadata.get("id"),
        "environment_id": metadata.get("environment_id"),
        "public": metadata.get("public", True),
        "archived": metadata.get("archived", False),
        "owner_uid": metadata.get("owner_uid"),
        "created_at": parse_datetime_as_utc(metadata.get("created_at")),
        "updated_at": parse_datetime_as_utc(metadata.get("updated_at")),
        "title": metadata.get("title", ""),
    }
    restored_doc = DataDoc(**constructor_fields)
    # ``meta`` is assigned after construction rather than passed to the constructor.
    restored_doc.meta = metadata.get("meta", {})
    return restored_doc
|
||
|
||
def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """
    Parse the cell sections of a serialized DataDoc back into DataCells.

    Each cell is expected as an HTML comment holding YAML metadata, followed
    by a ``## `` heading, a blank line and a fenced block with the content.
    Text that does not match this shape is ignored.

    Args:
        content_str: The Markdown text that follows the front matter.

    Returns:
        The parsed cells, in document order.

    Raises:
        ValueError: If a cell's metadata is not valid YAML, is not a mapping,
            or names an unknown cell type.
    """
    cells = []
    # Pattern to match cell metadata in HTML comments and the following content
    pattern = re.compile(r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL)
    for match in pattern.finditer(content_str):
        metadata_str, cell_content = match.group(1), match.group(2)
        try:
            metadata = yaml.safe_load(metadata_str)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}") from e
        if not isinstance(metadata, dict):
            raise ValueError("Invalid cell metadata: expected a YAML mapping.")

        cell_type_name = metadata.get("cell_type", "query")
        try:
            cell_type = DataCellType[cell_type_name.lower()]
        except (AttributeError, KeyError) as e:
            # AttributeError: cell_type is null or not a string.
            # KeyError: the name is not a DataCellType member.
            raise ValueError(
                f"Unknown cell type in cell metadata: {cell_type_name!r}"
            ) from e

        cell = DataCell(
            id=metadata.get("id"),
            cell_type=cell_type,
            context=(
                cell_content.strip() if cell_type != DataCellType.chart else None
            ),  # Charts are generated from the metadata, and not from content
            created_at=parse_datetime_as_utc(metadata.get("created_at")),
            updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
            meta=metadata.get("meta", {}),
        )
        cells.append(cell)
    return cells
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from app.db import with_session | ||
from models.github import GitHubLink | ||
from models.datadoc import DataDoc | ||
|
||
|
||
@with_session
def create_repo_link(
    datadoc_id: int,
    user_id: int,
    directory: str,
    commit=True,
    session=None,
):
    """
    Create the GitHub link for a DataDoc.

    Args:
        datadoc_id: Id of the DataDoc to link.
        user_id: Id of the user stored on the link.
        directory: Directory stored on the link.
        commit: Forwarded to ``GitHubLink.create``.
        session: Database session passed to the model calls
            (presumably supplied by ``with_session`` when omitted; confirm).

    Returns:
        The newly created GitHubLink.

    Raises:
        AssertionError: If the DataDoc does not exist or already has a link.
    """
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed when Python runs with -O, which would silently skip these checks.
    datadoc = DataDoc.get(id=datadoc_id, session=session)
    if datadoc is None:
        raise AssertionError(f"DataDoc with id {datadoc_id} not found")

    existing_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    if existing_link is not None:
        raise AssertionError(
            f"GitHub link for DataDoc with id {datadoc_id} already exists"
        )

    return GitHubLink.create(
        {
            "datadoc_id": datadoc_id,
            "user_id": user_id,
            "directory": directory,
        },
        commit=commit,
        session=session,
    )
|
||
|
||
@with_session
def get_repo_link(datadoc_id: int, session=None):
    """
    Fetch the GitHub link of a DataDoc.

    Args:
        datadoc_id: Id of the DataDoc whose link is requested.
        session: Database session passed to the model call
            (presumably supplied by ``with_session`` when omitted; confirm).

    Returns:
        The GitHubLink for the DataDoc.

    Raises:
        AssertionError: If the DataDoc has no GitHub link.
    """
    github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed when Python runs with -O, which would return None here instead.
    if github_link is None:
        raise AssertionError(
            f"GitHub link for DataDoc with id {datadoc_id} not found"
        )
    return github_link
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,4 @@ | |
from .data_element import * | ||
from .comment import * | ||
from .survey import * | ||
from .github import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import sqlalchemy as sql | ||
from sqlalchemy.sql import func | ||
from lib.sqlalchemy import CRUDMixin | ||
from sqlalchemy.orm import backref, relationship | ||
from app import db | ||
|
||
Base = db.Base


class GitHubLink(Base, CRUDMixin):
    """
    ORM model for the ``github_link`` table.

    A row stores the directory a DataDoc is linked to, together with the id
    of a user and creation/update timestamps.
    """

    __tablename__ = "github_link"

    id = sql.Column(sql.Integer, primary_key=True, autoincrement=True)
    # unique=True allows at most one link row per DataDoc.
    datadoc_id = sql.Column(
        sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True
    )
    user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False)
    # server_default matches the Alembic migration that creates this table, so
    # the model and the migrated schema declare the same column default.
    directory = sql.Column(
        sql.String(255),
        nullable=False,
        default="datadocs",
        server_default="datadocs",
    )
    created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False)
    updated_at = sql.Column(
        sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Exposes ``DataDoc.github_link`` as a single object; the delete-orphan
    # cascade is configured on that backref side.
    datadoc = relationship(
        "DataDoc",
        backref=backref("github_link", uselist=False, cascade="all, delete-orphan"),
    )
    # NOTE(review): user_id is not unique, so one user could be referenced by
    # several links while this backref is scalar (uselist=False); confirm that
    # is intended before relying on ``User.github_link``.
    user = relationship("User", backref=backref("github_link", uselist=False))

    def to_dict(self):
        """Return the column values of this link as a plain dict."""
        return {
            "id": self.id,
            "datadoc_id": self.datadoc_id,
            "user_id": self.user_id,
            "directory": self.directory,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }
Oops, something went wrong.