-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Link Datadoc to GitHub Directory #7
Changes from all commits
9932e31
18bd8b3
daa18d2
bcede5d
88cc1ff
398cb62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Add GitHub Datadoc Link | ||
Revision ID: aa328ae9dced | ||
Revises: f7b11b3e3a95 | ||
Create Date: 2024-10-23 21:04:55.052696 | ||
""" | ||
|
||
from alembic import op | ||
import sqlalchemy as sa | ||
|
||
|
||
# revision identifiers, used by Alembic. | ||
revision = "aa328ae9dced" | ||
down_revision = "f7b11b3e3a95" | ||
branch_labels = None | ||
depends_on = None | ||
|
||
|
||
def upgrade():
    """Create the ``github_link`` table.

    The table stores one row per linked DataDoc: the DataDoc id, the id of a
    user, the directory string, and creation/update timestamps.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "github_link",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("datadoc_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        # The directory falls back to "datadocs" at the database level.
        sa.Column(
            "directory",
            sa.String(length=255),
            nullable=False,
            server_default="datadocs",
        ),
        # Both timestamps are filled in by the database via now().
        sa.Column(
            "created_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
        ),
        sa.Column(
            "updated_at", sa.DateTime(), server_default=sa.text("now()"), nullable=False
        ),
        sa.ForeignKeyConstraint(
            ["datadoc_id"],
            ["data_doc.id"],
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["user.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
        # At most one link row per DataDoc.
        sa.UniqueConstraint("datadoc_id"),
    )
    # ### end Alembic commands ###
|
||
|
||
def downgrade():
    """Revert this revision by dropping the ``github_link`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("github_link")
    # ### end Alembic commands ###
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import yaml | ||
import re | ||
from typing import List | ||
from models.datadoc import DataDoc, DataCell | ||
from const.data_doc import DataCellType | ||
from datetime import datetime, timezone | ||
|
||
|
||
def parse_datetime_as_utc(date_str: str) -> datetime:
    """
    Parse the given date string to a timezone-aware datetime object in UTC.

    Args:
        date_str: An ISO 8601 string. An already-parsed ``datetime`` is also
            accepted, as is a falsy value (``None`` or ``""``).

    Returns:
        An aware ``datetime`` whose tzinfo is UTC. Values that carry no
        offset are interpreted as UTC. Falsy input yields the current UTC
        time.

    Raises:
        ValueError: If ``date_str`` is a non-empty string that is not a valid
            ISO 8601 timestamp.
    """
    if not date_str:
        return datetime.now(timezone.utc)

    if isinstance(date_str, datetime):
        parsed = date_str
    else:
        iso_text = date_str
        # ``fromisoformat`` only understands the "Z" suffix on newer Python
        # versions, so normalise it to an explicit offset first.
        if iso_text.endswith("Z"):
            iso_text = iso_text[:-1] + "+00:00"
        parsed = datetime.fromisoformat(iso_text)

    # ``astimezone`` treats a naive value as host-local time, which would
    # shift timestamps depending on the machine's timezone. The serializer
    # writes ``isoformat()`` of the stored values, so pin naive values to UTC
    # to keep the round trip stable.
    if parsed.utcoffset() is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
||
|
||
def serialize_datadoc_to_markdown(datadoc: DataDoc) -> str:
    """Serialize a DataDoc to a Markdown document.

    The output is a YAML front matter block (delimited by ``---`` lines)
    holding the DataDoc metadata, followed by a blank line, a ``# <title>``
    heading and the serialized cells.

    Args:
        datadoc: The DataDoc to serialize.

    Returns:
        The Markdown text.

    Raises:
        ValueError: If the metadata cannot be dumped to YAML; the original
            YAML error is attached as the cause.
    """
    # Serialize DataDoc metadata to YAML front matter for readability
    datadoc_metadata = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "public": datadoc.public,
        "archived": datadoc.archived,
        "owner_uid": datadoc.owner_uid,
        "created_at": datadoc.created_at.isoformat() if datadoc.created_at else None,
        "updated_at": datadoc.updated_at.isoformat() if datadoc.updated_at else None,
        "meta": datadoc.meta,
        "title": datadoc.title,
    }
    try:
        metadata_yaml = yaml.dump(datadoc_metadata, default_flow_style=False)
    except yaml.YAMLError as e:
        # Chain the cause so the underlying YAML traceback is not lost.
        raise ValueError(f"Error serializing DataDoc metadata to YAML: {e}") from e

    front_matter = f"---\n{metadata_yaml}---\n\n"
    title = f"# {datadoc.title}\n\n"
    content = serialize_datacells(datadoc.cells)
    return front_matter + title + content
|
||
|
||
def serialize_datacells(cells: List[DataCell]) -> str:
    """Serialize cells to Markdown, one section per cell.

    Each section is an HTML comment containing the cell metadata as YAML,
    immediately followed by the cell's rendered content. Sections are joined
    with a blank line.

    Args:
        cells: The cells to serialize, in output order.

    Returns:
        The Markdown text for all cells (empty string when there are none).

    Raises:
        ValueError: If a cell's metadata cannot be dumped to YAML; the
            original YAML error is attached as the cause.
    """
    sections = []
    for cell in cells:
        # Since GitHub's Markdown renderer does not recognize multiple --- blocks as separate YAML sections,
        # we serialize cell metadata in HTML comment to hide it from rendered view
        cell_metadata = {
            "id": cell.id,
            "cell_type": cell.cell_type.name.lower(),
            "created_at": cell.created_at.isoformat() if cell.created_at else None,
            "updated_at": cell.updated_at.isoformat() if cell.updated_at else None,
            "meta": cell.meta,
        }
        try:
            cell_metadata_yaml = yaml.dump(cell_metadata, default_flow_style=False)
        except yaml.YAMLError as e:
            # Chain the cause so the underlying YAML traceback is not lost.
            raise ValueError(f"Error serializing cell metadata to YAML: {e}") from e

        cell_metadata_comment = f"<!--\n{cell_metadata_yaml.strip()}\n-->\n"
        sections.append(cell_metadata_comment + serialize_cell_content(cell))

    return "\n\n".join(sections)
|
||
|
||
def serialize_cell_content(cell: DataCell) -> str:
    """Render one cell as a Markdown heading plus a fenced code block.

    Query cells become ``## Query: <title>`` with a ``sql`` fence, text cells
    become ``## Text`` with a ``text`` fence, and chart cells become a fixed
    placeholder because their content is not serialized here.

    Args:
        cell: The cell to render.

    Returns:
        The Markdown snippet for the cell.

    Raises:
        ValueError: If the cell type is none of query, text or chart.
    """
    cell_meta = cell.meta or {}
    if cell.cell_type == DataCellType.query:
        query_title = cell_meta.get("title", "Query")
        # A missing context is rendered as an empty block rather than crashing.
        query_text = (cell.context or "").strip()
        return f"## Query: {query_title}\n\n```sql\n{query_text}\n```\n"
    if cell.cell_type == DataCellType.text:
        text_body = (cell.context or "").strip()
        return f"## Text\n\n```text\n{text_body}\n```\n"
    if cell.cell_type == DataCellType.chart:
        return "## Chart\n\n```text\n*Chart generated from the metadata.*\n```\n"
    # Fail loudly instead of returning None, which would otherwise surface
    # as a confusing TypeError when the caller concatenates the result.
    raise ValueError(f"Unsupported cell type: {cell.cell_type}")
|
||
|
||
def deserialize_datadoc_from_markdown(markdown_str: str) -> DataDoc:
    """Rebuild a DataDoc from its Markdown representation.

    The front matter supplies the DataDoc fields; the remaining text is
    parsed into the DataDoc's cells.

    Args:
        markdown_str: Markdown text starting with a YAML front matter block.

    Returns:
        The DataDoc with its ``cells`` attribute populated.
    """
    metadata, body = extract_front_matter(markdown_str)
    doc = create_datadoc_from_metadata(metadata)
    doc.cells = deserialize_datadoc_content(body)
    return doc
|
||
|
||
def extract_front_matter(markdown_str: str):
    """Split a Markdown document into parsed front matter and body.

    The document must begin with a ``---`` line, YAML, a closing ``---`` line
    and a blank line.

    Args:
        markdown_str: The full Markdown text.

    Returns:
        A ``(front_matter, content)`` tuple: the front matter parsed into a
        dict, and the text that follows the front matter block.

    Raises:
        ValueError: If the front matter block is missing, is not valid YAML,
            or does not parse to a mapping.
    """
    match = re.match(r"^---\n(.*?)\n---\n\n", markdown_str, re.DOTALL)
    if match is None:
        raise ValueError("Invalid Markdown format: Missing front matter.")

    try:
        front_matter = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        # Chain the cause so the underlying YAML traceback is not lost.
        raise ValueError(f"Error parsing front matter YAML: {e}") from e

    # Valid YAML can still be a scalar or a list; the metadata consumers call
    # ``.get`` on the result, so reject anything that is not a mapping here.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Invalid Markdown format: Front matter must be a YAML mapping."
        )

    return front_matter, markdown_str[match.end() :]
|
||
|
||
def create_datadoc_from_metadata(metadata: dict) -> DataDoc:
    """Build a DataDoc from a front matter mapping.

    Missing keys fall back to: ``public=True``, ``archived=False``, an empty
    title, an empty ``meta`` dict, and ``None`` for the id fields.
    Timestamps go through ``parse_datetime_as_utc``.

    Args:
        metadata: The parsed front matter.

    Returns:
        The new DataDoc; ``meta`` is assigned after construction.
    """
    constructor_fields = {
        "id": metadata.get("id"),
        "environment_id": metadata.get("environment_id"),
        "public": metadata.get("public", True),
        "archived": metadata.get("archived", False),
        "owner_uid": metadata.get("owner_uid"),
        "created_at": parse_datetime_as_utc(metadata.get("created_at")),
        "updated_at": parse_datetime_as_utc(metadata.get("updated_at")),
        "title": metadata.get("title", ""),
    }
    doc = DataDoc(**constructor_fields)
    doc.meta = metadata.get("meta", {})
    return doc
|
||
|
||
def deserialize_datadoc_content(content_str: str) -> List[DataCell]:
    """Parse the Markdown body of a serialized DataDoc into cells.

    Every cell is expected as an HTML comment holding YAML metadata, followed
    by a ``## ...`` heading, a blank line and a fenced code block. Text that
    does not match this shape is ignored.

    Args:
        content_str: The Markdown text that follows the front matter.

    Returns:
        The cells in document order (empty when nothing matches).

    Raises:
        ValueError: If a cell's metadata is not valid YAML, is not a mapping,
            or names an unknown cell type.
    """
    # Pattern to match cell metadata in HTML comments and the following content
    pattern = re.compile(r"<!--\n(.*?)\n-->\n## .*?\n\n```.*?\n(.*?)\n```", re.DOTALL)

    cells = []
    for match in pattern.finditer(content_str):
        metadata_str, cell_content = match.group(1), match.group(2)
        try:
            metadata = yaml.safe_load(metadata_str)
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing cell metadata YAML: {e}") from e
        # Empty or scalar YAML parses fine but has no ``.get``.
        if not isinstance(metadata, dict):
            raise ValueError("Invalid cell metadata: expected a YAML mapping.")

        cell_type_str = str(metadata.get("cell_type") or "query").lower()
        try:
            cell_type = DataCellType[cell_type_str]
        except KeyError as e:
            # Report bad input the same way as the other parse errors in
            # this module instead of leaking a bare KeyError.
            raise ValueError(
                f"Unknown cell type in cell metadata: {cell_type_str!r}"
            ) from e

        cell = DataCell(
            id=metadata.get("id"),
            cell_type=cell_type,
            context=(
                cell_content.strip() if cell_type != DataCellType.chart else None
            ),  # Charts are generated from the metadata, and not from content
            created_at=parse_datetime_as_utc(metadata.get("created_at")),
            updated_at=parse_datetime_as_utc(metadata.get("updated_at")),
            meta=metadata.get("meta", {}),
        )
        cells.append(cell)
    return cells
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from app.db import with_session | ||
from models.github import GitHubLink | ||
from models.datadoc import DataDoc | ||
|
||
|
||
@with_session
def create_repo_link(
    datadoc_id: int,
    user_id: int,
    directory: str,
    commit=True,
    session=None,
):
    """Create the GitHub link row for a DataDoc.

    Args:
        datadoc_id: Id of the DataDoc to link.
        user_id: Id of the user stored on the link.
        directory: Directory string stored on the link.
        commit: Forwarded to ``GitHubLink.create``.
        session: Database session used for the lookups and the insert.

    Returns:
        The value returned by ``GitHubLink.create``.

    Raises:
        AssertionError: If the DataDoc does not exist, or it already has a
            GitHub link.
    """
    # The DataDoc must exist before it can be linked.
    target_doc = DataDoc.get(id=datadoc_id, session=session)
    assert target_doc is not None, f"DataDoc with id {datadoc_id} not found"

    # A DataDoc may only be linked once.
    existing_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    assert (
        existing_link is None
    ), f"GitHub link for DataDoc with id {datadoc_id} already exists"

    link_fields = {
        "datadoc_id": datadoc_id,
        "user_id": user_id,
        "directory": directory,
    }
    return GitHubLink.create(link_fields, commit=commit, session=session)
|
||
|
||
@with_session
def get_repo_link(datadoc_id: int, session=None):
    """Fetch the GitHub link for a DataDoc.

    Args:
        datadoc_id: Id of the DataDoc whose link is wanted.
        session: Database session used for the lookup.

    Returns:
        The ``GitHubLink`` found for ``datadoc_id``.

    Raises:
        AssertionError: If the DataDoc has no GitHub link.
    """
    link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    assert link is not None, f"GitHub link for DataDoc with id {datadoc_id} not found"
    return link
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,4 @@ | |
from .data_element import * | ||
from .comment import * | ||
from .survey import * | ||
from .github import * |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import sqlalchemy as sql | ||
from sqlalchemy.sql import func | ||
from lib.sqlalchemy import CRUDMixin | ||
from sqlalchemy.orm import backref, relationship | ||
from app import db | ||
|
||
Base = db.Base | ||
|
||
|
||
class GitHubLink(Base, CRUDMixin):
    """ORM model for the ``github_link`` table: links a DataDoc to a directory."""

    __tablename__ = "github_link"
    id = sql.Column(sql.Integer, primary_key=True, autoincrement=True)
    # Unique: a DataDoc can have at most one link row.
    datadoc_id = sql.Column(
        sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True
    )
    # The user who linked the current GitHub directory to the DataDoc
    # (per the review discussion; not necessarily the doc owner).
    user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False)

    # NOTE(review): this is a Python-side ``default``, while the migration
    # declares a ``server_default`` for the same column — confirm both are wanted.
    directory = sql.Column(sql.String(255), nullable=False, default="datadocs")
    created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False)
    updated_at = sql.Column(
        sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Adds a scalar ``github_link`` attribute on DataDoc.
    datadoc = relationship(
        "DataDoc",
        backref=backref("github_link", uselist=False, cascade="all, delete-orphan"),
    )
    # NOTE(review): ``user_id`` is not unique, so one user may be referenced by
    # several links; ``uselist=False`` on this backref looks at odds with
    # that — confirm the intended cardinality.
    user = relationship("User", backref=backref("github_link", uselist=False))

    def to_dict(self):
        """Return the column values of this link as a plain dict."""
        return {
            "id": self.id,
            "datadoc_id": self.datadoc_id,
            "user_id": self.user_id,
            "directory": self.directory,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just realized: do we allow a user to unlink a DataDoc, or to change the directory of the doc?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, I plan to add support for it later on, possibly with a warning that it will change the version history.